Skip to content

Commit

Permalink
Merge pull request #26 from unipept/text-compression
Browse files Browse the repository at this point in the history
Compress protein text using bit packing (from 8 bits per char to 5 bits)
  • Loading branch information
pverscha authored Sep 19, 2024
2 parents e510f5e + 79bee50 commit 4d63609
Show file tree
Hide file tree
Showing 12 changed files with 779 additions and 117 deletions.
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions bitarray/src/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,10 @@ mod tests {
#[test]
fn test_write_binary() {
let mut bitarray = BitArray::with_capacity(4, 40);
bitarray.set(0, 0x1234567890);
bitarray.set(1, 0xabcdef0123);
bitarray.set(2, 0x4567890abc);
bitarray.set(3, 0xdef0123456);
bitarray.set(0, 0x1234567890_u64);
bitarray.set(1, 0xabcdef0123_u64);
bitarray.set(2, 0x4567890abc_u64);
bitarray.set(3, 0xdef0123456_u64);

let mut buffer = Vec::new();
bitarray.write_binary(&mut buffer).unwrap();
Expand Down
18 changes: 11 additions & 7 deletions bitarray/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ impl BitArray {
/// * `index` - The index of the value to set.
/// * `value` - The value to set at the specified index.
pub fn set(&mut self, index: usize, value: u64) {
let value: u64 = value;
let start_block = index * self.bits_per_value / 64;
let start_block_offset = index * self.bits_per_value % 64;

Expand Down Expand Up @@ -142,11 +143,14 @@ impl BitArray {
pub fn clear(&mut self) {
    // Overwrite each stored block in place; the storage keeps its length.
    for block in self.data.iter_mut() {
        *block = 0;
    }
}

/// Borrows a contiguous run of the underlying `u64` blocks.
///
/// The indices address whole blocks of the backing storage, not individual values.
///
/// # Arguments
///
/// * `start_slice` - Index of the first block to include.
/// * `end_slice` - Index one past the last block to include.
///
/// # Returns
///
/// The blocks in the half-open range `start_slice..end_slice`.
///
/// # Panics
///
/// Panics whenever indexing the backing storage with `start_slice..end_slice` panics
/// (for a `Vec`/slice: `start_slice > end_slice` or `end_slice` past the end).
pub fn get_data_slice(&self, start_slice: usize, end_slice: usize) -> &[u64] {
    let block_range = start_slice..end_slice;
    &self.data[block_range]
}
}

/// Writes the data to a writer in a binary format using a bit array. This function is helpful
/// when writing large amounts of data to a writer in chunks. The data is written in chunks of the
/// specified capacity, so memory usage is minimized.
/// Writes the data to a writer in a binary format using a bit array. The data is written
/// in chunks of the specified capacity, so memory usage is minimized.
///
/// # Arguments
///
Expand Down Expand Up @@ -257,10 +261,10 @@ mod tests {
fn test_bitarray_set() {
// Checks that four 40-bit values written with `set` end up packed back to back in the
// underlying u64 blocks.
// presumably the arguments are (number of values, bits per value) — confirm against `with_capacity`.
let mut bitarray = BitArray::with_capacity(4, 40);

// NOTE(review): every index is written twice with the same bit pattern, first as an untyped
// literal and then with an explicit `_u64` suffix; this looks like the before/after sides of
// a diff shown together.
bitarray.set(0, 0b0001110011111010110001000111111100110010);
bitarray.set(1, 0b1100001001010010011000010100110111001001);
bitarray.set(2, 0b1111001101001101101101101011101001010001);
bitarray.set(3, 0b0000100010010001010001001110101110011100);
bitarray.set(0, 0b0001110011111010110001000111111100110010_u64);
bitarray.set(1, 0b1100001001010010011000010100110111001001_u64);
bitarray.set(2, 0b1111001101001101101101101011101001010001_u64);
bitarray.set(3, 0b0000100010010001010001001110101110011100_u64);

// 4 * 40 = 160 bits: the expected blocks hold the values starting at the most significant
// bit of the first block, so the third block carries the final 32 bits in its upper half
// and zeros below.
assert_eq!(bitarray.data, vec![0x1cfac47f32c25261, 0x4dc9f34db6ba5108, 0x9144EB9C00000000]);
}
Expand Down
2 changes: 1 addition & 1 deletion sa-builder/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fn main() {
eprintln!();
eprintln!("📋 Started loading the proteins...");
let start_proteins_time = get_time_ms().unwrap();
let mut data = Proteins::try_from_database_file_without_annotations(&database_file)
let mut data = Proteins::try_from_database_file_uncompressed(&database_file)
.unwrap_or_else(|err| eprint_and_exit(err.to_string().as_str()));
eprintln!(
"✅ Successfully loaded the proteins in {} seconds!",
Expand Down
1 change: 1 addition & 0 deletions sa-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ clap = { version = "4.4.8", features = ["derive"] }
rayon = "1.8.1"
serde = { version = "1.0.197", features = ["derive"] }
sa-mappings = { path = "../sa-mappings" }
text-compression = { path = "../text-compression" }
bitarray = { path = "../bitarray" }
serde_json = "1.0.116"
10 changes: 5 additions & 5 deletions sa-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ mod tests {
#[test]
fn test_suffix_array_compressed() {
let mut bitarray = BitArray::with_capacity(5, 40);
bitarray.set(0, 1);
bitarray.set(1, 2);
bitarray.set(2, 3);
bitarray.set(3, 4);
bitarray.set(4, 5);
bitarray.set(0, 1 as u64);
bitarray.set(1, 2 as u64);
bitarray.set(2, 3 as u64);
bitarray.set(3, 4 as u64);
bitarray.set(4, 5 as u64);

let sa = SuffixArray::Compressed(bitarray, 1);
assert_eq!(sa.len(), 5);
Expand Down
Loading

0 comments on commit 4d63609

Please sign in to comment.