diff --git a/cseq/src/elias_fano.rs b/cseq/src/elias_fano.rs index 4a94145..63cdb9c 100644 --- a/cseq/src/elias_fano.rs +++ b/cseq/src/elias_fano.rs @@ -1,6 +1,6 @@ //! Elias-Fano representation of a non-decreasing sequence of integers. -use std::{iter::FusedIterator, io}; +use std::{io, iter::FusedIterator, ops::{Deref, DerefMut}}; use binout::{AsIs, Serializer}; use bitm::{ceiling_div, n_lowest_bits, RankSelect101111, BitAccess, BitVec, CombinedSampling, ConstCombinedSamplingDensity, Rank, Select, Select0, Select0ForRank101111, SelectForRank101111}; @@ -9,8 +9,8 @@ use dyn_size_of::GetSize; /// Builds [`Sequence`] of values added by push methods. /// After adding values in non-decreasing order by [`Self::push`] method, /// [`Self::finish`] can be called to construct [`Sequence`]. -pub struct Builder { - hi: Box<[u64]>, // most significant bits of each item, unary coded +pub struct Builder> { + hi: BV, // most significant bits of each item, unary coded lo: Box<[u64]>, // least significant bits of each item, vector of `bits_per_lo_entry` bit items bits_per_lo: u8, // bit size of each entry in lo current_len: usize, // number of already pushed items @@ -19,7 +19,7 @@ pub struct Builder { universe: u64 // all pushed items must be in range [`0`, `universe`) } -impl Builder { +impl Builder { /// Returns declared *universe*. All pushed items must be in range [0, *universe*). #[inline] pub fn universe(&self) -> u64 { self.universe } @@ -31,18 +31,20 @@ impl Builder { /// Returns value of recently pushed item (if any) or 0. #[inline] pub fn last_pushed(&self) -> u64 { self.last_added } +} +impl Builder { /// Constructs [`Builder`] to build [`Sequence`] with `final_len` values in range [`0`, `universe`). /// After adding values in non-decreasing order by [`Self::push`] method, /// [`Self::finish`] can be called to construct [`Sequence`]. 
pub fn new(final_len: usize, universe: u64) -> Self { if final_len == 0 || universe == 0 { - return Self { hi: Default::default(), lo: Default::default(), bits_per_lo: 0, current_len: 0, target_len: 0, last_added: 0, universe }; + return Self { hi: BV::with_64bit_segments(0, 0), lo: Default::default(), bits_per_lo: 0, current_len: 0, target_len: 0, last_added: 0, universe }; } let bits_per_lo = (universe / final_len as u64).checked_ilog2().unwrap_or(0) as u8; Self { // adding the last (i.e. (final_len-1)-th) item with value universe-1 sets bit (final_len-1) + ((universe-1) >> bits_per_lo) - hi: Box::with_zeroed_bits(final_len + ((universe-1) >> bits_per_lo) as usize), + hi: BV::with_zeroed_bits(final_len + ((universe-1) >> bits_per_lo) as usize), lo: Box::with_zeroed_bits(1.max(final_len * bits_per_lo as usize)), bits_per_lo, current_len: 0, @@ -51,7 +53,9 @@ impl Builder { universe, } } +} +impl> Builder { /// A version of [`Self::push`] without any checks and panic. pub unsafe fn push_unchecked(&mut self, value: u64) { self.hi.set_bit((value>>self.bits_per_lo) as usize + self.current_len); @@ -89,11 +93,13 @@ impl Builder { pub fn push_diffs>(&mut self, diffs: I) { for diff in diffs { self.push_diff(diff) } } +} +impl> Builder { /// Finishes building and returns [`Sequence`] containing the pushed items and custom select strategy. /// The resulted [`Sequence`] is invalid if not all declared items have been pushed. - pub fn finish_unchecked_s(self) -> Sequence { - Sequence:: { + pub fn finish_unchecked_s(self) -> Sequence { + Sequence:: { hi: self.hi.into(), lo: self.lo, bits_per_lo: self.bits_per_lo, @@ -103,24 +109,27 @@ impl Builder { /// Finishes building and returns [`Sequence`] containing the pushed items and custom select strategy. /// Panics if not all declared items have been pushed. 
- pub fn finish_s(self) -> Sequence { + pub fn finish_s(self) -> Sequence { assert_eq!(self.current_len, self.target_len, "Cannot finish building Elias-Fano Sequence as the current length ({}) differs from the target ({})", self.current_len, self.target_len); self.finish_unchecked_s::() } /// Finishes building and returns [`Sequence`] containing the pushed items. /// The resulted [`Sequence`] is invalid if not all declared items have been pushed. - #[inline] pub fn finish_unchecked(self) -> Sequence { + #[inline] pub fn finish_unchecked(self) -> Sequence { self.finish_unchecked_s() } /// Finishes building and returns [`Sequence`] containing the pushed items. /// Panics if not all declared items have been pushed. - #[inline] pub fn finish(self) -> Sequence { + #[inline] pub fn finish(self) -> Sequence { self.finish_s() } } +/// Default select strategy for Elias-Fano [`Sequence`]. +pub type DefaultSelectStrategy = CombinedSampling; + /// Elias-Fano representation of a non-decreasing sequence of integers. /// /// By default [`bitm::CombinedSampling`] is used as both a select and select0 strategy @@ -143,20 +152,22 @@ impl Builder { /// - Daisuke Okanohara, Kunihiko Sadakane, "Practical entropy-compressed rank/select dictionary", /// Proceedings of the Meeting on Algorithm Engineering & Experiments, January 2007, pages 60–70, /// (Section 6 "SDarrays") -pub struct Sequence, S0 = CombinedSampling> { - hi: RankSelect101111, // most significant bits of each item, unary coded +pub struct Sequence> { + hi: RankSelect101111, // most significant bits of each item, unary coded lo: Box<[u64]>, // least significant bits of each item, vector of `bits_per_lo` bit items bits_per_lo: u8, // bit size of each entry in lo len: usize // number of items } -impl Sequence { +impl Sequence { /// Returns number of stored values. #[inline] pub fn len(&self) -> usize { self.len } /// Returns whether the sequence is empty. 
#[inline] pub fn is_empty(&self) -> bool { self.len == 0 } +} +impl> Sequence { /// Advance `position` by 1 forward. The result is undefined if `position` is invalid. #[inline] unsafe fn advance_position_unchecked(&self, position: &mut Position) { position.lo += 1; @@ -220,28 +231,28 @@ impl Sequence { } /// Converts `position` to [`Cursor`]. - #[inline] fn cursor(&self, position: Position) -> Cursor<'_, S, S0> { + #[inline] fn cursor(&self, position: Position) -> Cursor { Cursor { sequence: &self, position } } /// Returns iterator over `self` values. - #[inline] pub fn iter(&self) -> Iterator { + #[inline] pub fn iter(&self) -> Iterator { Iterator { sequence: self, begin: self.begin_position(), end: self.end_position() } } /// Returns an iterator that gives the value of the first item followed by /// the differences between the values of subsequent items. - #[inline] pub fn diffs(&self) -> DiffIterator { + #[inline] pub fn diffs(&self) -> DiffIterator { DiffIterator { sequence: self, position: self.begin_position(), prev_value: 0 } } /// Returns cursor that points to the first item of `self`. - #[inline] pub fn begin(&self) -> Cursor { + #[inline] pub fn begin(&self) -> Cursor { self.cursor(self.begin_position()) } /// Returns cursor that points past the end. - #[inline] pub fn end(&self) -> Cursor { + #[inline] pub fn end(&self) -> Cursor { self.cursor(self.end_position()) } @@ -274,14 +285,15 @@ impl Sequence { } } -impl Sequence { +impl+FromIterator> Sequence { /// Reads `self` from the `input`. /// /// Custom select strategies do not have to be the same as the ones used by the written sequence. 
pub fn read_s(input: &mut dyn io::Read) -> io::Result { let bits_per_lo: u8 = AsIs::read(input)?; - let (hi, len) = RankSelect101111::build(AsIs::read_array(input)?); + let hi: BV = >::read_array_iter(input)?.collect::>()?; + let (hi, len) = RankSelect101111::build(hi); let lo = if bits_per_lo != 0 && len != 0 { AsIs::read_n(input, ceiling_div(len * bits_per_lo as usize, 64))? } else { @@ -289,17 +301,19 @@ impl Sequence { }; Ok(Self { hi, lo, bits_per_lo, len }) } +} +impl> Sequence { /// Constructs [`Sequence`] with custom select strategy and /// filled with elements from the `items` slice, which must be in non-decreasing order. pub fn with_items_from_slice_s + Clone>(items: &[I]) -> Self { - let mut b = Builder::new(items.len(), items.last().map_or(0, |v| v.clone().into()+1)); + let mut b = Builder::::new(items.len(), items.last().map_or(0, |v| v.clone().into()+1)); b.push_all(items.iter().map(|v| v.clone().into())); b.finish_unchecked_s() } } -impl Sequence { +impl> Sequence { /// Returns value at given `index`. The result is undefined if `index` is out of bounds. #[inline] pub unsafe fn get_unchecked(&self, index: usize) -> u64 { (((unsafe{self.hi.select_unchecked(index)} - index) as u64) << self.bits_per_lo) | @@ -347,18 +361,18 @@ impl Sequence { /// Returns valid cursor that points to given `index` of `self`. /// Result is undefined if `index` is out of bounds. - #[inline] pub unsafe fn cursor_at_unchecked(&self, index: usize) -> Cursor { + #[inline] pub unsafe fn cursor_at_unchecked(&self, index: usize) -> Cursor { self.cursor(self.position_at_unchecked(index)) } /// Returns valid cursor that points to given `index` of `self`, /// or [`None`] if `index` is out of bounds. 
- #[inline] pub unsafe fn cursor_at(&self, index: usize) -> Option> { + #[inline] pub unsafe fn cursor_at(&self, index: usize) -> Option> { (index < self.len).then(|| unsafe { self.cursor_at_unchecked(index) }) } } -impl Sequence { +impl> Sequence { /// Returns the uncorrected position of first `self` item with value greater than or equal to given `value`. /// The `hi` of result may need correction (moving forward to first 1 bit) if it is not an index of 1 bit. /// `lo` is already correct. @@ -392,7 +406,7 @@ impl Sequence { } /// Returns the cursor pointed to the first `self` item with value greater than or equal to given `value`. - #[inline] pub fn geq_cursor(&self, value: u64) -> Cursor { + #[inline] pub fn geq_cursor(&self, value: u64) -> Cursor { self.cursor(self.geq_position(value)) } @@ -402,7 +416,7 @@ impl Sequence { } /// Returns the cursor pointing to the first occurrence of `value` or [`None`] if `self` does not contain `value`. - #[inline] pub fn cursor_of(&self, value: u64) -> Option> { + #[inline] pub fn cursor_of(&self, value: u64) -> Option> { self.position_of(value).map(|position| self.cursor(position)) } @@ -430,9 +444,9 @@ impl GetSize for Sequence where RankSelect101111: GetSize { const USES_DYN_MEM: bool = true; } -impl<'ef, S, S0> IntoIterator for &'ef Sequence { +impl<'ef, S, S0, BV: Deref> IntoIterator for &'ef Sequence { type Item = u64; - type IntoIter = Iterator<'ef, S, S0>; + type IntoIter = Iterator<'ef, S, S0, BV>; #[inline] fn into_iter(self) -> Self::IntoIter { self.iter() } } @@ -449,15 +463,15 @@ impl Position { } /// Iterator over [`Sequence`] values, returned by [`Sequence::iter`] . -pub struct Iterator<'ef, S, S0> { - sequence: &'ef Sequence, +pub struct Iterator<'ef, S, S0, BV> { + sequence: &'ef Sequence, begin: Position, end: Position } -impl Iterator<'_, S, S0> { +impl Iterator<'_, S, S0, BV> { /// Returns the [`Sequence`] over which `self` iterates. 
- pub fn sequence(&self) -> &Sequence { self.sequence } + pub fn sequence(&self) -> &Sequence { self.sequence } /// Returns index of the value about to return by `next`. pub fn index(&self) -> usize { self.begin.lo } @@ -466,7 +480,7 @@ impl Iterator<'_, S, S0> { pub fn back_index(&self) -> usize { self.begin.lo } } -impl std::iter::Iterator for Iterator<'_, S, S0> { +impl> std::iter::Iterator for Iterator<'_, S, S0, BV> { type Item = u64; fn next(&mut self) -> Option { @@ -474,7 +488,7 @@ impl std::iter::Iterator for Iterator<'_, S, S0> { } } -impl DoubleEndedIterator for Iterator<'_, S, S0> { +impl> DoubleEndedIterator for Iterator<'_, S, S0, BV> { fn next_back(&mut self) -> Option { (self.begin.lo != self.end.lo).then(|| unsafe { self.sequence.advance_position_back_unchecked(&mut self.end); @@ -483,25 +497,25 @@ impl DoubleEndedIterator for Iterator<'_, S, S0> { } } -impl FusedIterator for Iterator<'_, S, S0> {} +impl> FusedIterator for Iterator<'_, S, S0, BV> {} /// Iterator that yields the value of the first item followed by the differences /// between the values of subsequent items of [`Sequence`]. -pub struct DiffIterator<'ef, S, S0> { - sequence: &'ef Sequence, +pub struct DiffIterator<'ef, S, S0, BV> { + sequence: &'ef Sequence, position: Position, prev_value: u64 } -impl DiffIterator<'_, S, S0> { +impl DiffIterator<'_, S, S0, BV> { /// Returns the [`Sequence`] over which `self` iterates. - pub fn sequence(&self) -> &Sequence { self.sequence } + pub fn sequence(&self) -> &Sequence { self.sequence } /// Returns index of the value about to return by `next`. 
pub fn index(&self) -> usize { self.position.lo } } -impl std::iter::Iterator for DiffIterator<'_, S, S0> { +impl> std::iter::Iterator for DiffIterator<'_, S, S0, BV> { type Item = u64; fn next(&mut self) -> Option { @@ -512,19 +526,19 @@ impl std::iter::Iterator for DiffIterator<'_, S, S0> { } } -impl FusedIterator for DiffIterator<'_, S, S0> {} +impl> FusedIterator for DiffIterator<'_, S, S0, BV> {} /// Points either a position or past the end in Elias-Fano [`Sequence`]. /// It is a kind of iterator over the [`Sequence`]. #[derive(Clone, Copy)] -pub struct Cursor<'ef, S, S0> { - sequence: &'ef Sequence, +pub struct Cursor<'ef, S, S0, BV> { + sequence: &'ef Sequence, position: Position, } -impl Cursor<'_, S, S0> { +impl Cursor<'_, S, S0, BV> { /// Returns the [`Sequence`] in which `self` points the item. - pub fn sequence(&self) -> &Sequence { self.sequence } + pub fn sequence(&self) -> &Sequence { self.sequence } /// Returns whether `self` points past the end (is invalid). #[inline] pub fn is_end(&self) -> bool { self.position.lo == self.sequence.len } @@ -532,14 +546,16 @@ impl Cursor<'_, S, S0> { /// Returns whether `self` is valid (i.e., not past the end) and thus its value can be obtained. #[inline] pub fn is_valid(&self) -> bool { self.position.lo != self.sequence.len } + /// Returns [`Sequence`] index pointed by `self`, i.e. converts `self` to index. + #[inline] pub fn index(&self) -> usize { self.position.lo } +} + +impl> Cursor<'_, S, S0, BV> { /// Returns value pointed by `self`. Result is undefined if `self` points past the end. #[inline] pub unsafe fn value_unchecked(&self) -> u64 { return self.sequence.value_at_position_unchecked(self.position) } - /// Returns [`Sequence`] index pointed by `self`, i.e. converts `self` to index. - #[inline] pub fn index(&self) -> usize { self.position.lo } - /// Returns value pointed by `self` or [`None`] if it points past the end. 
#[inline] pub fn value(&self) -> Option { return self.sequence.value_at_position(self.position) @@ -581,7 +597,7 @@ impl Cursor<'_, S, S0> { /// Returns an iterator that gives the differences between the values of subsequent items, /// starting from `self`. - #[inline] pub fn diffs(&self) -> DiffIterator<'_, S, S0> { + #[inline] pub fn diffs(&self) -> DiffIterator<'_, S, S0, BV> { if self.position.lo == 0 { return self.sequence.diffs(); } let mut prev = self.position; unsafe{self.sequence.advance_position_back_unchecked(&mut prev)}; @@ -589,7 +605,7 @@ impl Cursor<'_, S, S0> { } } -impl std::iter::Iterator for Cursor<'_, S, S0> { +impl> std::iter::Iterator for Cursor<'_, S, S0, BV> { type Item = u64; /// Returns value pointed by `self` and advances it one position forward. @@ -598,7 +614,7 @@ impl std::iter::Iterator for Cursor<'_, S, S0> { } } -impl FusedIterator for Cursor<'_, S, S0> {} +impl> FusedIterator for Cursor<'_, S, S0, BV> {} #[cfg(test)]