-
Notifications
You must be signed in to change notification settings - Fork 175
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FEAT] Enable concat for swordfish (#2976)
Implement concat as a streaming sink in swordfish. Concatenating two dataframes in a streaming executor is pretty simple. You just stream the left side, then stream the right side. This PR implements just that. --------- Co-authored-by: Colin Ho <[email protected]> Co-authored-by: Colin Ho <[email protected]>
- Loading branch information
1 parent
31d5412
commit 7d600c2
Showing
7 changed files
with
91 additions
and
95 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,61 +1,68 @@ | ||
// use std::sync::Arc; | ||
|
||
// use common_error::DaftResult; | ||
// use daft_micropartition::MicroPartition; | ||
// use tracing::instrument; | ||
|
||
// use super::sink::{Sink, SinkResultType}; | ||
|
||
// #[derive(Clone)] | ||
// pub struct ConcatSink { | ||
// result_left: Vec<Arc<MicroPartition>>, | ||
// result_right: Vec<Arc<MicroPartition>>, | ||
// } | ||
|
||
// impl ConcatSink { | ||
// pub fn new() -> Self { | ||
// Self { | ||
// result_left: Vec::new(), | ||
// result_right: Vec::new(), | ||
// } | ||
// } | ||
|
||
// #[instrument(skip_all, name = "ConcatSink::sink")] | ||
// fn sink_left(&mut self, input: &Arc<MicroPartition>) -> DaftResult<SinkResultType> { | ||
// self.result_left.push(input.clone()); | ||
// Ok(SinkResultType::NeedMoreInput) | ||
// } | ||
|
||
// #[instrument(skip_all, name = "ConcatSink::sink")] | ||
// fn sink_right(&mut self, input: &Arc<MicroPartition>) -> DaftResult<SinkResultType> { | ||
// self.result_right.push(input.clone()); | ||
// Ok(SinkResultType::NeedMoreInput) | ||
// } | ||
// } | ||
|
||
// impl Sink for ConcatSink { | ||
// fn sink(&mut self, index: usize, input: &Arc<MicroPartition>) -> DaftResult<SinkResultType> { | ||
// match index { | ||
// 0 => self.sink_left(input), | ||
// 1 => self.sink_right(input), | ||
// _ => panic!("concat only supports 2 inputs, got {index}"), | ||
// } | ||
// } | ||
|
||
// fn in_order(&self) -> bool { | ||
// true | ||
// } | ||
|
||
// fn num_inputs(&self) -> usize { | ||
// 2 | ||
// } | ||
|
||
// #[instrument(skip_all, name = "ConcatSink::finalize")] | ||
// fn finalize(self: Box<Self>) -> DaftResult<Vec<Arc<MicroPartition>>> { | ||
// Ok(self | ||
// .result_left | ||
// .into_iter() | ||
// .chain(self.result_right.into_iter()) | ||
// .collect()) | ||
// } | ||
// } | ||
use std::sync::Arc; | ||
|
||
use common_error::{DaftError, DaftResult}; | ||
use daft_micropartition::MicroPartition; | ||
use tracing::instrument; | ||
|
||
use super::streaming_sink::{StreamingSink, StreamingSinkOutput, StreamingSinkState}; | ||
use crate::pipeline::PipelineResultType; | ||
|
||
/// Mutable state carried between successive calls to `ConcatSink::execute`.
#[derive(Debug)]
struct ConcatSinkState {
    /// Input index of the most recently accepted morsel.
    ///
    /// Indices must be non-decreasing: the next accepted morsel carries either
    /// this same index or the one immediately after it.
    pub curr_idx: usize,
}
impl StreamingSinkState for ConcatSinkState { | ||
fn as_any_mut(&mut self) -> &mut dyn std::any::Any { | ||
self | ||
} | ||
} | ||
|
||
/// Streaming sink that concatenates its inputs by passing each morsel through unchanged.
///
/// The sink has no fields of its own; the index of the last accepted morsel is
/// tracked in the state object handed to `execute`.
#[derive(Debug, Default)]
pub struct ConcatSink {}
|
||
impl StreamingSink for ConcatSink { | ||
/// Execute for the ConcatSink operator does not do any computation and simply returns the input data. | ||
/// It only expects that the indices of the input data are strictly non-decreasing. | ||
/// TODO(Colin): If maintain_order is false, technically we could accept any index. Make this optimization later. | ||
#[instrument(skip_all, name = "ConcatSink::sink")] | ||
fn execute( | ||
&self, | ||
index: usize, | ||
input: &PipelineResultType, | ||
state: &mut dyn StreamingSinkState, | ||
) -> DaftResult<StreamingSinkOutput> { | ||
let state = state | ||
.as_any_mut() | ||
.downcast_mut::<ConcatSinkState>() | ||
.expect("ConcatSink should have ConcatSinkState"); | ||
|
||
// If the index is the same as the current index or one more than the current index, then we can accept the morsel. | ||
if state.curr_idx == index || state.curr_idx + 1 == index { | ||
state.curr_idx = index; | ||
Ok(StreamingSinkOutput::NeedMoreInput(Some( | ||
input.as_data().clone(), | ||
))) | ||
} else { | ||
Err(DaftError::ComputeError(format!("Concat sink received out-of-order data. Expected index to be {} or {}, but got {}.", state.curr_idx, state.curr_idx + 1, index))) | ||
} | ||
} | ||
|
||
fn name(&self) -> &'static str { | ||
"Concat" | ||
} | ||
|
||
fn finalize( | ||
&self, | ||
_states: Vec<Box<dyn StreamingSinkState>>, | ||
) -> DaftResult<Option<Arc<MicroPartition>>> { | ||
Ok(None) | ||
} | ||
|
||
fn make_state(&self) -> Box<dyn StreamingSinkState> { | ||
Box::new(ConcatSinkState { curr_idx: 0 }) | ||
} | ||
|
||
/// Since the ConcatSink does not do any computation, it does not need to spawn multiple workers. | ||
fn max_concurrency(&self) -> usize { | ||
1 | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters