From 4ed02be6d9e04d13d955cc57965056798bd85ecd Mon Sep 17 00:00:00 2001 From: Tim Sawicki Date: Thu, 5 Dec 2024 18:06:51 -0500 Subject: [PATCH] bunch of formatting changes around admonitions --- docs/api-reference/application.md | 30 +- docs/api-reference/context.md | 4 +- docs/api-reference/dataframe.md | 60 +- docs/api-reference/kafka.md | 54 +- docs/api-reference/quixstreams.md | 13778 ++++++++-------- docs/api-reference/serialization.md | 60 +- docs/api-reference/sinks.md | 128 +- docs/api-reference/sources.md | 82 +- docs/api-reference/state.md | 26 +- docs/api-reference/topics.md | 48 +- docs/build/README.md | 9 +- docs/tutorials/anomaly-detection/tutorial.md | 33 +- docs/tutorials/purchase-filtering/tutorial.md | 49 +- docs/tutorials/websocket-source/tutorial.md | 43 +- docs/tutorials/word-count/tutorial.md | 71 +- mkdocs.yml | 5 +- 16 files changed, 7287 insertions(+), 7193 deletions(-) diff --git a/docs/api-reference/application.md b/docs/api-reference/application.md index 3c03bf756..2a9414c4c 100644 --- a/docs/api-reference/application.md +++ b/docs/api-reference/application.md @@ -10,7 +10,7 @@ class Application() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L75) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L75) The main Application class. @@ -85,7 +85,7 @@ def __init__(broker_address: Optional[Union[str, ConnectionConfig]] = None, processing_guarantee: ProcessingGuarantee = "at-least-once") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L113)
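A minimal sketch of this constructor in use, assuming a local broker; the consumer group and topic name below are placeholders for illustration only:

```python
from quixstreams import Application

# Broker address, consumer group, and topic name are assumptions for illustration.
app = Application(
    broker_address="localhost:9092",
    consumer_group="example-group",
    auto_offset_reset="earliest",
)
sdf = app.dataframe(app.topic("input-topic"))
sdf = sdf.update(lambda value: print(value))  # side effect: print each record
app.run()
```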
@@ -174,7 +174,7 @@ instead of the default one. def Quix(cls, *args, **kwargs) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L352) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L352) RAISES EXCEPTION: DEPRECATED. @@ -197,7 +197,7 @@ def topic(name: str, timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L384) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L384) Create a topic definition. @@ -279,7 +279,7 @@ def dataframe(topic: Optional[Topic] = None, source: Optional[BaseSource] = None) -> StreamingDataFrame ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L464) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L464) A simple helper method that generates a `StreamingDataFrame`, which is used @@ -335,7 +335,7 @@ to be used as an input topic. def stop(fail: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L520) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L520) Stop the internal poll loop and the message processing. @@ -362,7 +362,7 @@ to unhandled exception, and it shouldn't commit the current checkpoint. def get_producer() -> Producer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L565) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L565) Create and return a pre-configured Producer instance. The Producer is initialized with params passed to Application. @@ -397,7 +397,7 @@ with app.get_producer() as producer: def get_consumer(auto_commit_enable: bool = True) -> Consumer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L613) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L613) Create and return a pre-configured Consumer instance. @@ -454,7 +454,7 @@ with app.get_consumer() as consumer: def clear_state() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L663) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L663) Clear the state of the application. @@ -468,7 +468,7 @@ Clear the state of the application. def add_source(source: BaseSource, topic: Optional[Topic] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L669) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L669) Add a source to the application. @@ -495,7 +495,7 @@ Note: the names of default topics are prefixed with "source__". 
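A minimal sketch of wiring a source into an application, assuming a local broker; `my_source` stands in for any concrete `BaseSource` subclass instance:

```python
from quixstreams import Application

app = Application(broker_address="localhost:9092")  # assumed broker address

# `my_source` is a placeholder for an instance of a BaseSource subclass,
# e.g. a custom source or one of the connectors in quixstreams.sources.
topic = app.add_source(my_source)  # default topic name is prefixed with "source__"
sdf = app.dataframe(topic=topic)   # process the source output like any other topic
app.run()
```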
def run(dataframe: Optional[StreamingDataFrame] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L700) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L700) Start processing data from Kafka using provided `StreamingDataFrame` @@ -531,7 +531,7 @@ app.run() def setup_topics() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L823) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L823) Validate and create the topics @@ -543,7 +543,7 @@ Validate and create the topics class ApplicationConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L999) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L999) Immutable object holding the application configuration @@ -566,7 +566,7 @@ def settings_customise_sources( ) -> Tuple[PydanticBaseSettingsSource, ...] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L1034) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L1034) Included to ignore reading/setting values from the environment @@ -580,7 +580,7 @@ Included to ignore reading/setting values from the environment def copy(**kwargs) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L1047) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L1047) Update the application config and return a copy diff --git a/docs/api-reference/context.md b/docs/api-reference/context.md index d4a413e96..fbef0ed6f 100644 --- a/docs/api-reference/context.md +++ b/docs/api-reference/context.md @@ -12,7 +12,7 @@ def set_message_context(context: Optional[MessageContext]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/context.py#L22) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/context.py#L22) Set a MessageContext for the current message in the given `contextvars.Context` @@ -55,7 +55,7 @@ sdf = sdf.update(lambda value: alter_context(value)) def message_context() -> Optional[MessageContext] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/context.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/context.py#L53) Get a MessageContext for the current message, which houses most of the message diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index 3655dc0b4..253b67912 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -10,7 +10,7 @@ class StreamingDataFrame(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L68) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L68) `StreamingDataFrame` is the main object you will use for ETL work. 
@@ -81,7 +81,7 @@ def apply(func: Union[ metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L174) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L174) Apply a function to transform the value and return a new value. @@ -139,7 +139,7 @@ def update(func: Union[ metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L263) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L263) Apply a function to mutate value in-place or to perform a side effect @@ -207,7 +207,7 @@ def filter(func: Union[ metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L355) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L355) Filter value using provided function. @@ -259,7 +259,7 @@ def group_by(key: Union[str, Callable[[Any], Any]], key_serializer: Optional[SerializerType] = "json") -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L441) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L441) "Groups" messages by re-keying them via the provided group_by operation @@ -323,7 +323,7 @@ a clone with this operation added (assign to keep its effect). def contains(key: str) -> StreamingSeries ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L514) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L514) Check if the key is present in the Row value. @@ -362,7 +362,7 @@ or False otherwise. def to_topic(topic: Topic, key: Optional[Callable[[Any], Any]] = None) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L539) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L539) Produce current value to a topic. You can optionally specify a new key. @@ -415,7 +415,7 @@ the updated StreamingDataFrame instance (reassignment NOT required). def set_timestamp(func: Callable[[Any, Any, int, Any], int]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L584) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L584) Set a new timestamp based on the current message value and its metadata. @@ -467,7 +467,7 @@ def set_headers( ]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L625) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L625) Set new message headers based on the current message value and metadata. 
@@ -516,7 +516,7 @@ a new StreamingDataFrame instance def print(pretty: bool = True, metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L676) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L676) Print out the current message value (and optionally, the message metadata) to @@ -572,7 +572,7 @@ def compose( ) -> Dict[str, VoidExecutor] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L718) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L718) Compose all functions of this StreamingDataFrame into one big closure. @@ -626,7 +626,7 @@ def test(value: Any, topic: Optional[Topic] = None) -> List[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L752) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L752) A shorthand to test `StreamingDataFrame` with provided value @@ -663,7 +663,7 @@ def tumbling_window(duration_ms: Union[int, timedelta], name: Optional[str] = None) -> TumblingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L789) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L789) Create a tumbling window transformation on this StreamingDataFrame. @@ -749,7 +749,7 @@ def hopping_window(duration_ms: Union[int, timedelta], name: Optional[str] = None) -> HoppingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L865) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L865) Create a hopping window transformation on this StreamingDataFrame. @@ -843,7 +843,7 @@ def sliding_window(duration_ms: Union[int, timedelta], name: Optional[str] = None) -> SlidingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L957) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L957) Create a sliding window transformation on this StreamingDataFrame. @@ -932,7 +932,7 @@ def drop(columns: Union[str, List[str]], errors: Literal["ignore", "raise"] = "raise") -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L1038) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L1038) Drop column(s) from the message value (value must support `del`, like a dict). @@ -976,7 +976,7 @@ a new StreamingDataFrame instance def sink(sink: BaseSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L1082) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L1082) Sink the processed data to the specified destination. @@ -1006,7 +1006,7 @@ operations, but branches can still be generated from its originating SDF. 
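A minimal sketch of terminating a pipeline with a sink, assuming a local broker; `my_sink` stands in for any concrete `BaseSink` subclass instance, and the topic name is a placeholder:

```python
from quixstreams import Application

app = Application(broker_address="localhost:9092")  # assumed broker address
sdf = app.dataframe(app.topic("input-topic"))        # assumed topic name
sdf = sdf.apply(lambda value: {"reading": value})

# `my_sink` is a placeholder for an instance of a BaseSink subclass
# (e.g. one of the connectors in quixstreams.sinks).
sdf.sink(my_sink)  # terminal step: no further operations can be chained onto this branch
app.run()
```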
class StreamingSeries(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L70) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L70) `StreamingSeries` are typically generated by `StreamingDataframes` when getting elements from, or performing certain operations on, a `StreamingDataframe`, @@ -1073,7 +1073,7 @@ def from_apply_callback(cls, func: ApplyWithMetadataCallback, sdf_id: int) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L132) Create a StreamingSeries from a function. @@ -1102,7 +1102,7 @@ instance of `StreamingSeries` def apply(func: ApplyCallback) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L155) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L155) Add a callable to the execution list for this series. @@ -1154,7 +1154,7 @@ a new `StreamingSeries` with the new callable added def compose_returning() -> ReturningExecutor ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L189) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L189) Compose a list of functions from this StreamingSeries and its parents into one @@ -1185,7 +1185,7 @@ def compose( None]] = None) -> VoidExecutor ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L204) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L204) Compose all functions of this StreamingSeries into one big closure. @@ -1243,7 +1243,7 @@ def test(value: Any, ctx: Optional[MessageContext] = None) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L248) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L248) A shorthand to test `StreamingSeries` with provided value @@ -1275,7 +1275,7 @@ result of `StreamingSeries` def isin(other: Container) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L304) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L304) Check if series value is in "other". 
@@ -1320,7 +1320,7 @@ new StreamingSeries def contains(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L331) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L331) Check if series value contains "other" @@ -1365,7 +1365,7 @@ new StreamingSeries def is_(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L356) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L356) Check if series value refers to the same object as `other` @@ -1407,7 +1407,7 @@ new StreamingSeries def isnot(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L379) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L379) Check if series value does not refer to the same object as `other` @@ -1450,7 +1450,7 @@ new StreamingSeries def isnull() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L403) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L403) Check if series value is None. @@ -1487,7 +1487,7 @@ new StreamingSeries def notnull() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L426) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L426) Check if series value is not None. @@ -1524,7 +1524,7 @@ new StreamingSeries def abs() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L449) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L449) Get absolute value of the series value. diff --git a/docs/api-reference/kafka.md b/docs/api-reference/kafka.md index f74c7ef96..d3c97367e 100644 --- a/docs/api-reference/kafka.md +++ b/docs/api-reference/kafka.md @@ -10,7 +10,7 @@ class Producer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L42) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L42) @@ -26,7 +26,7 @@ def __init__(broker_address: Union[str, ConnectionConfig], flush_timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L43) A wrapper around `confluent_kafka.Producer`. @@ -66,7 +66,7 @@ def produce(topic: str, on_delivery: Optional[DeliveryCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L81) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L81) Produce a message to a topic. @@ -101,7 +101,7 @@ for the produced message. 
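A short illustration of the produce-and-flush flow described above, assuming a local broker; the topic name, key, and value are placeholders:

```python
from quixstreams import Application

app = Application(broker_address="localhost:9092")  # assumed broker address

with app.get_producer() as producer:
    # Key and value are passed as str/bytes here; serialization is up to the caller.
    producer.produce(
        topic="events",  # assumed topic name
        key="sensor-1",
        value=b'{"temperature": 21.5}',
    )
    producer.flush()  # wait until queued messages are delivered
```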
def poll(timeout: float = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L142) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L142) Polls the producer for events and calls `on_delivery` callbacks. @@ -122,7 +122,7 @@ Polls the producer for events and calls `on_delivery` callbacks. def flush(timeout: Optional[float] = None) -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L150) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L150) Wait for all messages in the Producer queue to be delivered. @@ -147,7 +147,7 @@ number of messages remaining to flush class TransactionalProducer(Producer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L181) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L181) A separate producer class used only internally for transactions (transactions are only needed when using a consumer). @@ -164,7 +164,7 @@ A separate producer class used only internally for transactions class BaseConsumer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L68) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L68) @@ -184,7 +184,7 @@ def __init__(broker_address: Union[str, ConnectionConfig], extra_config: Optional[dict] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L69) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L69) A wrapper around `confluent_kafka.Consumer`. @@ -227,7 +227,7 @@ Note: values passed as arguments override values in `extra_config`. def poll(timeout: Optional[float] = None) -> Optional[Message] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L132) Consumes a single message, calls callbacks and returns events. @@ -265,7 +265,7 @@ event or callback. None or -1 is infinite. Default: None. def unsubscribe() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L235) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L235) Remove current subscription. @@ -285,7 +285,7 @@ def store_offsets(message: Optional[Message] = None, offsets: Optional[List[TopicPartition]] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L244) Store offsets for a message or a list of offsets. @@ -318,7 +318,7 @@ def commit(message: Optional[Message] = None, asynchronous: bool = True) -> Optional[List[TopicPartition]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L275) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L275) Commit a message or a list of offsets. 
@@ -356,7 +356,7 @@ def committed(partitions: List[TopicPartition], timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L316) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L316) Retrieve committed offsets for the specified partitions. @@ -391,7 +391,7 @@ def get_watermark_offsets(partition: TopicPartition, cached: bool = False) -> Tuple[int, int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L334) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L334) Retrieve low and high offsets for the specified partition. @@ -430,7 +430,7 @@ def list_topics(topic: Optional[str] = None, timeout: Optional[float] = None) -> ClusterMetadata ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L360) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L360) Request metadata from the cluster. @@ -462,7 +462,7 @@ None or -1 is infinite. Default: None def memberid() -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L381) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L381) Return this client's broker-assigned group member id. @@ -490,7 +490,7 @@ def offsets_for_times(partitions: List[TopicPartition], timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L394) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L394) Look up offsets by timestamp for the specified partitions. @@ -529,7 +529,7 @@ None or -1 is infinite. Default: None def pause(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L420) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L420) Pause consumption for the provided list of partitions. @@ -557,7 +557,7 @@ Does NOT affect the result of `Consumer.assignment()`. def resume(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L433) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L433) Resume consumption for the provided list of partitions. @@ -581,7 +581,7 @@ Resume consumption for the provided list of partitions. def position(partitions: List[TopicPartition]) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L443) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L443) Retrieve current positions (offsets) for the specified partitions. @@ -614,7 +614,7 @@ the last consumed message + 1. def seek(partition: TopicPartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L457) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L457) Set consume position for partition to offset. 
@@ -646,7 +646,7 @@ pass the offset in an `assign()` call. def assignment() -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L474) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L474) Returns the current partition assignment. @@ -671,7 +671,7 @@ Returns the current partition assignment. def set_sasl_credentials(username: str, password: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L487) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L487) Sets the SASL credentials used for this client. @@ -698,7 +698,7 @@ This method is applicable only to SASL PLAIN and SCRAM mechanisms. def incremental_assign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L501) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L501) Assign new partitions. @@ -724,7 +724,7 @@ Any additional partitions besides the ones passed during the `Consumer` def incremental_unassign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L515) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L515) Revoke partitions. @@ -746,7 +746,7 @@ Can be called outside an on_revoke callback. def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L525) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L525) Close down and terminate the Kafka Consumer. @@ -769,7 +769,7 @@ see `poll()` for more info. def consumer_group_metadata() -> GroupMetadata ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L542) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L542) Used by the producer during consumer offset sending for an EOS transaction. diff --git a/docs/api-reference/quixstreams.md b/docs/api-reference/quixstreams.md index 6826ada4b..fe50458a9 100644 --- a/docs/api-reference/quixstreams.md +++ b/docs/api-reference/quixstreams.md @@ -2,7069 +2,6871 @@ ## quixstreams - - -## quixstreams.logging + - +## quixstreams.core -#### configure\_logging + -```python -def configure_logging(loglevel: Optional[Union[int, LogLevel]], - name: str = LOGGER_NAME, - pid: bool = False) -> bool -``` +## quixstreams.core.stream.functions.base -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/logging.py#L24) + -Configure "quixstreams" logger. +### StreamFunction ->***NOTE:*** If "quixstreams" logger already has pre-defined handlers -(e.g. logging has already been configured via `logging`, or the function -is called twice), it will skip configuration and return `False`. +```python +class StreamFunction(abc.ABC) +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/base.py#L10) -- `loglevel`: a valid log level as a string or None. -If None passed, this function is no-op and no logging will be configured. 
-- `name`: the log name included in the output -- `pid`: if True include the process PID in the logs +A base class for all the streaming operations in Quix Streams. -**Returns**: +It provides a `get_executor` method to return a closure to be called with the input +values. -True if logging config has been updated, otherwise False. + - +#### StreamFunction.get\_executor -## quixstreams.error\_callbacks +```python +@abc.abstractmethod +def get_executor(*child_executors: VoidExecutor) -> VoidExecutor +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/base.py#L24) -## quixstreams.platforms +Returns a wrapper to be called on a value, key, timestamp and headers. - + -## quixstreams.platforms.quix.config +## quixstreams.core.stream.functions.transform - + -#### strip\_workspace\_id\_prefix +### TransformFunction ```python -def strip_workspace_id_prefix(workspace_id: str, s: str) -> str +class TransformFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L46) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/transform.py#L9) -Remove the workspace ID from a given string if it starts with it. +Wrap a function into a "Transform" function. -Only used for consumer groups. +The provided callback must accept a value, a key and a timestamp. +It's expected to return a new value, new key and new timestamp. -**Arguments**: +This function must be used with caution, because it can technically change the +key. +It's supposed to be used by the library internals and not be a part of the public +API. -- `workspace_id`: the workspace id -- `s`: the string to append to +The result of the callback will always be passed downstream. -**Returns**: + -the string with workspace_id prefix removed +## quixstreams.core.stream.functions.update - + -#### prepend\_workspace\_id +### UpdateFunction ```python -def prepend_workspace_id(workspace_id: str, s: str) -> str +class UpdateFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L59) - -Add the workspace ID as a prefix to a given string if it does not have it. - -Only used for consumer groups. - -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/update.py#L9) -- `workspace_id`: the workspace id -- `s`: the string to append to +Wrap a function into an "Update" function. -**Returns**: +The provided function must accept a value, and it's expected to mutate it +or to perform some side effect. -the string with workspace_id prepended +The result of the callback is always ignored, and the original input is passed +downstream. - + -### QuixApplicationConfig +### UpdateWithMetadataFunction ```python -@dataclasses.dataclass -class QuixApplicationConfig() +class UpdateWithMetadataFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L73) - -A convenience container class for Quix Application configs. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/update.py#L34) -### QuixKafkaConfigsBuilder +Wrap a function into an "Update" function. -```python -class QuixKafkaConfigsBuilder() -``` +The provided function must accept a value, a key, and a timestamp. 
+The callback is expected to mutate the value or to perform some side effect with it. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L83) +The result of the callback is always ignored, and the original input is passed +downstream. -Retrieves all the necessary information from the Quix API and builds all the -objects required to connect a confluent-kafka client to the Quix Platform. + -If not executed within the Quix platform directly, you must provide a Quix -"streaming" (aka "sdk") token, or Personal Access Token. +## quixstreams.core.stream.functions -Ideally you also know your workspace name or id. If not, you can search for it -using a known topic name, but note the search space is limited to the access level -of your token. + -It also currently handles the app_auto_create_topics setting for Quix Applications. +## quixstreams.core.stream.functions.apply - + -#### QuixKafkaConfigsBuilder.\_\_init\_\_ +### ApplyFunction ```python -def __init__(quix_sdk_token: Optional[str] = None, - workspace_id: Optional[str] = None, - quix_portal_api_service: Optional[QuixPortalApiService] = None, - timeout: float = 30, - topic_create_timeout: float = 60) +class ApplyFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L99) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/apply.py#L9) -**Arguments**: +Wrap a function into "Apply" function. -- `quix_portal_api_service`: A QuixPortalApiService instance (else generated) -- `workspace_id`: A valid Quix Workspace ID (else searched for) +The provided callback is expected to return a new value based on input, +and its result will always be passed downstream. - + -#### QuixKafkaConfigsBuilder.convert\_topic\_response +### ApplyWithMetadataFunction ```python -@classmethod -def convert_topic_response(cls, api_response: dict) -> Topic +class ApplyWithMetadataFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L178) - -Converts a GET or POST ("create") topic API response to a Topic object +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/apply.py#L51) -**Arguments**: +Wrap a function into "Apply" function. -- `api_response`: the dict response from a get or create topic call +The provided function is expected to accept value, and timestamp and return +a new value based on input, +and its result will always be passed downstream. -**Returns**: + -a corresponding Topic object +## quixstreams.core.stream.functions.filter - + -#### QuixKafkaConfigsBuilder.strip\_workspace\_id\_prefix +### FilterFunction ```python -def strip_workspace_id_prefix(s: str) -> str +class FilterFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L206) - -Remove the workspace ID from a given string if it starts with it. - -Only used for consumer groups. - -**Arguments**: - -- `s`: the string to append to - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/filter.py#L9) -the string with workspace_id prefix removed +Wraps a function into a "Filter" function. +The result of a Filter function is interpreted as boolean. +If it's `True`, the input will be return downstream. 
+If it's `False`, the `Filtered` exception will be raised to signal that the +value is filtered out. - + -#### QuixKafkaConfigsBuilder.prepend\_workspace\_id +### FilterWithMetadataFunction ```python -def prepend_workspace_id(s: str) -> str +class FilterWithMetadataFunction(StreamFunction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L217) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/filter.py#L32) -Add the workspace ID as a prefix to a given string if it does not have it. +Wraps a function into a "Filter" function. -Only used for consumer groups. +The passed callback must accept value, key, and timestamp, and it's expected to +return a boolean-like result. -**Arguments**: +If the result is `True`, the input will be passed downstream. +Otherwise, the value will be filtered out. -- `s`: the string to append to + -**Returns**: +## quixstreams.core.stream.functions.types -the string with workspace_id prepended + - +## quixstreams.core.stream.functions.utils -#### QuixKafkaConfigsBuilder.search\_for\_workspace + + +#### pickle\_copier ```python -def search_for_workspace(workspace_name_or_id: Optional[str] = None, - timeout: Optional[float] = None) -> Optional[dict] +def pickle_copier(obj: T) -> Callable[[], T] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L228) - -Search for a workspace given an expected workspace name or id. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/functions/utils.py#L12) -**Arguments**: +A utility function to copy objects using a "pickle" library. -- `workspace_name_or_id`: the expected name or id of a workspace -- `timeout`: response timeout (seconds); Default 30 +On average, it's faster than "copy.deepcopy". +It accepts an object and returns a callable creating copies of this object. -**Returns**: +**Arguments**: -the workspace data dict if search success, else None +- `obj`: an object to copy - + -#### QuixKafkaConfigsBuilder.get\_workspace\_info +## quixstreams.core.stream -```python -def get_workspace_info(known_workspace_topic: Optional[str] = None, - timeout: Optional[float] = None) -> dict -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L270) +## quixstreams.core.stream.stream -Queries for workspace data from the Quix API, regardless of instance cache, + -and updates instance attributes from query result. +### Stream -**Arguments**: +```python +class Stream() +``` -- `known_workspace_topic`: a topic you know to exist in some workspace -- `timeout`: response timeout (seconds); Default 30 +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L38) - + -#### QuixKafkaConfigsBuilder.search\_workspace\_for\_topic +#### Stream.\_\_init\_\_ ```python -def search_workspace_for_topic( - workspace_id: str, - topic: str, - timeout: Optional[float] = None) -> Optional[str] +def __init__(func: Optional[StreamFunction] = None, + parent: Optional[Self] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L298) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L39) -Search through all the topics in the given workspace id to see if there is a - -match with the provided topic. 
+A base class for all streaming operations. -**Arguments**: +`Stream` is an abstraction of a function pipeline. +Each Stream has a function and a parent (None by default). +When adding new function to the stream, it creates a new `Stream` object and +sets "parent" to the previous `Stream` to maintain an order of execution. -- `workspace_id`: the workspace to search in -- `topic`: the topic to search for -- `timeout`: response timeout (seconds); Default 30 +Streams supports four types of functions: -**Returns**: +- "Apply" - generate new values based on a previous one. + The result of an Apply function is passed downstream to the next functions. + If "expand=True" is passed and the function returns an `Iterable`, + each item of it will be treated as a separate value downstream. +- "Update" - update values in-place. + The result of an Update function is always ignored, and its input is passed + downstream. +- "Filter" - to filter values from the Stream. + The result of a Filter function is interpreted as boolean. + If it's `True`, the input will be passed downstream. + If it's `False`, the record will be filtered from the stream. +- "Transform" - to transform keys and timestamps along with the values. + "Transform" functions may change the keys and should be used with caution. + The result of the Transform function is passed downstream to the next + functions. + If "expand=True" is passed and the function returns an `Iterable`, + each item of it will be treated as a separate value downstream. -the workspace_id if success, else None +To execute the functions on the `Stream`, call `.compose()` method, and +it will return a closure to execute all the functions accumulated in the Stream +and its parents. - +**Arguments**: -#### QuixKafkaConfigsBuilder.search\_for\_topic\_workspace +- `func`: a function to be called on the stream. +It is expected to be wrapped into one of "Apply", "Filter", "Update" or +"Trasform" from `quixstreams.core.stream.functions` package. +Default - "ApplyFunction(lambda value: value)". +- `parent`: a parent `Stream` + + + +#### Stream.add\_filter ```python -def search_for_topic_workspace(topic: str, - timeout: Optional[float] = None - ) -> Optional[dict] +def add_filter(func: Union[FilterCallback, FilterWithMetadataCallback], + *, + metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L319) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L104) -Find what workspace a topic belongs to. +Add a function to filter values from the Stream. -If there is only one workspace altogether, it is assumed to be the workspace. -More than one means each workspace will be searched until the first hit. +The return value of the function will be interpreted as `bool`. +If the function returns `False`-like result, the Stream will raise `Filtered` +exception during execution. **Arguments**: -- `topic`: the topic to search for -- `timeout`: response timeout (seconds); Default 30 +- `func`: a function to filter values from the stream +- `metadata`: if True, the callback will receive key and timestamp along with +the value. +Default - `False`. 
**Returns**: -workspace data dict if topic search success, else None +a new `Stream` derived from the current one - + -#### QuixKafkaConfigsBuilder.create\_topic +#### Stream.add\_apply ```python -def create_topic(topic: Topic, timeout: Optional[float] = None) +def add_apply(func: Union[ + ApplyCallback, + ApplyExpandedCallback, + ApplyWithMetadataCallback, + ApplyWithMetadataExpandedCallback, +], + *, + expand: bool = False, + metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L348) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L129) -The actual API call to create the topic. +Add an "apply" function to the Stream. + +The function is supposed to return a new value, which will be passed +further during execution. **Arguments**: -- `topic`: a Topic instance -- `timeout`: response timeout (seconds); Default 30 +- `func`: a function to generate a new value +- `expand`: if True, expand the returned iterable into individual values +downstream. If returned value is not iterable, `TypeError` will be raised. +Default - `False`. +- `metadata`: if True, the callback will receive key and timestamp along with +the value. +Default - `False`. - +**Returns**: -#### QuixKafkaConfigsBuilder.get\_or\_create\_topic +a new `Stream` derived from the current one + + + +#### Stream.add\_update ```python -def get_or_create_topic(topic: Topic, timeout: Optional[float] = None) -> dict +def add_update(func: Union[UpdateCallback, UpdateWithMetadataCallback], + *, + metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L378) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L162) -Get or create topics in a Quix cluster as part of initializing the Topic +Add an "update" function to the Stream, that will mutate the input value. -object to obtain the true topic name. +The return of this function will be ignored and its input +will be passed downstream. **Arguments**: -- `topic`: a `Topic` object -- `timeout`: response timeout (seconds); Default 30 -marked as "Ready" (and thus ready to produce to/consume from). +- `func`: a function to mutate the value +- `metadata`: if True, the callback will receive key and timestamp along with +the value. +Default - `False`. - +**Returns**: -#### QuixKafkaConfigsBuilder.wait\_for\_topic\_ready\_statuses +a new Stream derived from the current one + + + +#### Stream.add\_transform ```python -def wait_for_topic_ready_statuses(topics: List[Topic], - timeout: Optional[float] = None, - finalize_timeout: Optional[float] = None) +def add_transform(func: Union[TransformCallback, TransformExpandedCallback], + *, + expand: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L404) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L186) -After the broker acknowledges topics for creation, they will be in a +Add a "transform" function to the Stream, that will mutate the input value. -"Creating" status; they not usable until they are set to a status of "Ready". +The callback must accept a value, a key, and a timestamp. +It's expected to return a new value, new key and new timestamp. -This blocks until all topics are marked as "Ready" or the timeout is hit. 
+The result of the callback which will be passed downstream +during execution. **Arguments**: -- `topics`: a list of `Topic` objects -- `timeout`: response timeout (seconds); Default 30 -- `finalize_timeout`: topic finalization timeout (seconds); Default 60 -marked as "Ready" (and thus ready to produce to/consume from). +- `func`: a function to mutate the value +- `expand`: if True, expand the returned iterable into individual items +downstream. If returned value is not iterable, `TypeError` will be raised. +Default - `False`. - +**Returns**: -#### QuixKafkaConfigsBuilder.get\_topic +a new Stream derived from the current one + + + +#### Stream.diff ```python -def get_topic(topic_name: str, timeout: Optional[float] = None) -> dict +def diff(other: "Stream") -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L447) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L211) -return the topic ID (the actual cluster topic name) if it exists, else raise +Takes the difference between Streams `self` and `other` based on their last + +common parent, and returns a new, independent `Stream` that includes only +this difference (the start of the "diff" will have no parent). + +It's impossible to calculate a diff when: + - Streams don't have a common parent. + - When the `self` Stream already includes all the nodes from + the `other` Stream, and the resulting diff is empty. **Arguments**: -- `topic_name`: name of the topic -- `timeout`: response timeout (seconds); Default 30 +- `other`: a `Stream` to take a diff from. **Raises**: -- `QuixApiRequestFailure`: when topic does not exist +- `ValueError`: if Streams don't have a common parent, +if the diff is empty, or pruning failed. **Returns**: -response dict of the topic info if topic found, else None +a new independent `Stream` instance whose root begins at the diff - + -#### QuixKafkaConfigsBuilder.get\_application\_config +#### Stream.root\_path ```python -def get_application_config(consumer_group_id: str) -> QuixApplicationConfig +def root_path(allow_splits=True) -> List[Self] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/config.py#L479) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L272) -Get all the necessary attributes for an Application to run on Quix Cloud. +Return a list of all parent Streams including the node itself. -**Arguments**: +Can optionally stop at a first encountered split with allow_splits=False -- `consumer_group_id`: consumer group id, if needed +The tree is ordered from parent to child (current node comes last). **Returns**: -a QuixApplicationConfig instance - - - -## quixstreams.platforms.quix.env +a list of `Stream` objects - + -### QuixEnvironment +#### Stream.full\_tree ```python -class QuixEnvironment() +def full_tree() -> List[Self] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/env.py#L7) - -Class to access various Quix platform environment settings +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L293) - +Starts at tree root and finds every Stream in the tree (including splits). 
-#### SDK\_TOKEN +**Returns**: -noqa: S105 +The collection of all Streams interconnected to this one - + -#### QuixEnvironment.state\_management\_enabled +#### Stream.compose ```python -@property -def state_management_enabled() -> bool +def compose( + allow_filters=True, + allow_expands=True, + allow_updates=True, + allow_transforms=True, + sink: Optional[Callable[[Any, Any, int, Any], + None]] = None) -> VoidExecutor ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/env.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L300) -Check whether "State management" is enabled for the current deployment +Generate an "executor" closure by mapping all relatives of this `Stream` and -**Returns**: +composing their functions together. -True if state management is enabled, otherwise False +The resulting "executor" can be called with a given +value, key, timestamp, and headers (i.e. a Kafka message). - +By default, executor doesn't return the result of the execution. +To accumulate the results, pass the `sink` parameter. -#### QuixEnvironment.deployment\_id +**Arguments**: -```python -@property -def deployment_id() -> Optional[str] -``` +- `allow_filters`: If False, this function will fail with `ValueError` if +the stream has filter functions in the tree. Default - True. +- `allow_updates`: If False, this function will fail with `ValueError` if +the stream has update functions in the tree. Default - True. +- `allow_expands`: If False, this function will fail with `ValueError` if +the stream has functions with "expand=True" in the tree. Default - True. +- `allow_transforms`: If False, this function will fail with `ValueError` if +the stream has transform functions in the tree. Default - True. +- `sink`: callable to accumulate the results of the execution, optional. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/env.py#L27) + -Return current Quix deployment id. +#### Stream.compose\_returning -This variable is meant to be set only by Quix Platform and only -when the application is deployed. +```python +def compose_returning() -> ReturningExecutor +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/core/stream/stream.py#L357) -deployment id or None +Compose a list of functions from this `Stream` and its parents into one +big closure that always returns the transformed record. - +This closure is to be used to execute the functions in the stream and to get +the result of the transformations. -#### QuixEnvironment.workspace\_id +Stream may only contain simple "apply" functions to be able to compose itself +into a returning function. + + + +## quixstreams.dataframe.utils + + + +#### ensure\_milliseconds ```python -@property -def workspace_id() -> Optional[str] +def ensure_milliseconds(delta: Union[int, timedelta]) -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/env.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/utils.py#L5) -Return Quix workspace id if set +Convert timedelta to milliseconds. -**Returns**: +If the `delta` is not +This function will also round the value to the closest milliseconds in case of +higher precision. 
-workspace id or None +**Arguments**: - +- `delta`: `timedelta` object -#### QuixEnvironment.portal\_api +**Returns**: -```python -@property -def portal_api() -> Optional[str] -``` +timedelta value in milliseconds as `int` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/env.py#L47) + -Return Quix Portal API url if set +## quixstreams.dataframe.windows -**Returns**: + -portal API URL or None +## quixstreams.dataframe.windows.base - + -#### QuixEnvironment.state\_dir +#### get\_window\_ranges ```python -@property -def state_dir() -> str +def get_window_ranges(timestamp_ms: int, + duration_ms: int, + step_ms: Optional[int] = None) -> Deque[Tuple[int, int]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/env.py#L56) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/base.py#L17) -Return application state directory on Quix. +Get a list of window ranges for the given timestamp. + +**Arguments**: + +- `timestamp_ms`: timestamp in milliseconds +- `duration_ms`: window duration in milliseconds +- `step_ms`: window step in milliseconds for hopping windows, optional. **Returns**: -path to state dir +a list of (, ) tuples - + -## quixstreams.platforms.quix.checks +## quixstreams.dataframe.windows.definitions - + -#### check\_state\_management\_enabled +### FixedTimeWindowDefinition ```python -def check_state_management_enabled() +class FixedTimeWindowDefinition(abc.ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/checks.py#L11) - -Check if State Management feature is enabled for the current deployment on -Quix platform. -If it's disabled, the exception will be raised. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L18) - + -#### check\_state\_dir +#### FixedTimeWindowDefinition.sum ```python -def check_state_dir(state_dir: Path) +def sum() -> "FixedTimeWindow" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/checks.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L66) -Check if Application "state_dir" matches the state dir on Quix platform. - -If it doesn't match, the warning will be logged. - -**Arguments**: - -- `state_dir`: application state_dir path - - +Configure the window to aggregate data by summing up values within -## quixstreams.platforms.quix +each window period. - +**Returns**: -## quixstreams.platforms.quix.api +an instance of `FixedTimeWindow` configured to perform sum aggregation. - + -### QuixPortalApiService +#### FixedTimeWindowDefinition.count ```python -class QuixPortalApiService() +def count() -> "FixedTimeWindow" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/api.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L81) -A light wrapper around the Quix Portal Api. If used in the Quix Platform, it will -use that workspaces auth token and portal endpoint, else you must provide it. +Configure the window to aggregate data by counting the number of values -Function names closely reflect the respective API endpoint, -each starting with the method [GET, POST, etc.] followed by the endpoint path. +within each window period. 
-Results will be returned in the form of request's Response.json(), unless something -else is required. Non-200's will raise exceptions. +**Returns**: -See the swagger documentation for more info about the endpoints. +an instance of `FixedTimeWindow` configured to perform record count. - + -#### QuixPortalApiService.get\_workspace\_certificate +#### FixedTimeWindowDefinition.mean ```python -def get_workspace_certificate(workspace_id: Optional[str] = None, - timeout: float = 30) -> Optional[bytes] +def mean() -> "FixedTimeWindow" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/api.py#L119) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L96) -Get a workspace TLS certificate if available. - -Returns `None` if certificate is not specified. - -**Arguments**: +Configure the window to aggregate data by calculating the mean of the values -- `workspace_id`: workspace id, optional -- `timeout`: request timeout; Default 30 +within each window period. **Returns**: -certificate as bytes if present, or None - - - -## quixstreams.platforms.quix.exceptions - - - -## quixstreams.platforms.quix.topic\_manager +an instance of `FixedTimeWindow` configured to calculate the mean +of the values. - + -### QuixTopicManager +#### FixedTimeWindowDefinition.reduce ```python -class QuixTopicManager(TopicManager) +def reduce(reducer: Callable[[Any, Any], Any], + initializer: Callable[[Any], Any]) -> "FixedTimeWindow" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/topic_manager.py#L10) - -The source of all topic management with quixstreams. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L116) -This is specifically for Applications using the Quix Cloud. +Configure the window to perform a custom aggregation using `reducer` -Generally initialized and managed automatically by a Quix Application, -but allows a user to work with it directly when needed, such as using it alongside -a plain `Producer` to create its topics. +and `initializer` functions. -See methods for details. +Example Snippet: +```python +sdf = StreamingDataFrame(...) - +# Using "reduce()" to calculate multiple aggregates at once +def reducer(agg: dict, current: int): + aggregated = { + 'min': min(agg['min'], current), + 'max': max(agg['max'], current), + 'count': agg['count'] + 1 + } + return aggregated -#### QuixTopicManager.\_\_init\_\_ +def initializer(current) -> dict: + return {'min': current, 'max': current, 'count': 1} -```python -def __init__(topic_admin: TopicAdmin, - consumer_group: str, - quix_config_builder: QuixKafkaConfigsBuilder, - timeout: float = 30, - create_timeout: float = 60, - auto_create_topics: bool = True) +window = ( + sdf.tumbling_window(duration_ms=1000) + .reduce(reducer=reducer, initializer=initializer) + .final() +) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/platforms/quix/topic_manager.py#L31) - **Arguments**: -- `topic_admin`: an `Admin` instance -- `quix_config_builder`: A QuixKafkaConfigsBuilder instance, else one is -generated for you. -- `timeout`: response timeout (seconds) -- `create_timeout`: timeout for topic creation +- `reducer`: A function that takes two arguments +(the accumulated value and a new value) and returns a single value. +The returned value will be saved to the state store and sent downstream. 
+- `initializer`: A function to call for every first element of the window. +This function is used to initialize the aggregation within a window. - +**Returns**: -## quixstreams.dataframe.registry +A window configured to perform custom reduce aggregation on the data. - + -### DataframeRegistry +#### FixedTimeWindowDefinition.max ```python -class DataframeRegistry() +def max() -> "FixedTimeWindow" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/registry.py#L16) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L162) -Helps manage multiple `StreamingDataFrames` (multi-topic `Applications`) -and their respective repartitions. +Configure a window to aggregate the maximum value within each window period. -`SDF`s are registered by storing their topic and current Stream. +**Returns**: - +an instance of `FixedTimeWindow` configured to calculate the maximum +value within each window period. -#### DataframeRegistry.consumer\_topics + + +#### FixedTimeWindowDefinition.min ```python -@property -def consumer_topics() -> List[Topic] +def min() -> "FixedTimeWindow" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/registry.py#L31) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/definitions.py#L177) -**Returns**: +Configure a window to aggregate the minimum value within each window period. -a list of Topics a consumer should subscribe to. +**Returns**: - +an instance of `FixedTimeWindow` configured to calculate the maximum +value within each window period. -#### DataframeRegistry.register\_root + -```python -def register_root(new_sdf: "StreamingDataFrame") -``` +## quixstreams.dataframe.windows.sliding -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/registry.py#L37) + -Register a "root" SDF, or the start of a topic's processing. +### SlidingWindow -**Arguments**: +```python +class SlidingWindow(FixedTimeWindow) +``` -- `new_sdf`: the new SDF. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/sliding.py#L9) - + -#### DataframeRegistry.register\_groupby +#### SlidingWindow.process\_window ```python -def register_groupby(source_sdf: "StreamingDataFrame", - new_sdf: "StreamingDataFrame") +def process_window( + value: Any, timestamp_ms: int, state: WindowedState +) -> tuple[Iterable[WindowResult], Iterable[WindowResult]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/registry.py#L52) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/sliding.py#L10) -Register a "groupby" SDF, which is one generated with `SDF.group_by()`. +The algorithm is based on the concept that each message +is associated with a left and a right window. -**Arguments**: +Left Window: +- Begins at message timestamp - window size +- Ends at message timestamp -- `source_sdf`: the SDF used by `sdf.group_by()` -- `new_sdf`: the SDF generated by `sdf.group_by()`. 
+Right Window: +- Begins at message timestamp + 1 ms +- Ends at message timestamp + 1 ms + window size - +For example, for a window size of 10 and a message A arriving at timestamp 26: -#### DataframeRegistry.compose\_all + 0 10 20 30 40 50 60 +----|---------|---------|---------|---------|---------|---------|---> + A +left window -> |---------||---------| <- right window + 16 26 27 37 + +The algorithm scans backward through the window store: +- Starting at: start_time = message timestamp + 1 ms (the right window's start time) +- Ending at: start_time = message timestamp - 2 * window size + +During this traversal, the algorithm performs the following actions: + +1. Determine if the right window should be created. + If yes, locate the existing aggregation to copy to the new window. +2. Determine if the right window of the previous record should be created. + If yes, locate the existing aggregation and combine it with the incoming message. +3. Locate and update the left window if it exists. +4. If the left window does not exist, create it. Locate the existing + aggregation and combine it with the incoming message. +5. Locate and update all existing windows to which the new message belongs. + + + +## quixstreams.dataframe.windows.time\_based + + + +### FixedTimeWindow ```python -def compose_all( - sink: Optional[Callable[[Any, Any, int, Any], None]] = None -) -> Dict[str, VoidExecutor] +class FixedTimeWindow() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/registry.py#L75) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/time_based.py#L42) -Composes all the Streams and returns them in a dict, where key is its topic. + -**Arguments**: +#### FixedTimeWindow.final -- `sink`: callable to accumulate the results of the execution, optional. +```python +def final() -> "StreamingDataFrame" +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/time_based.py#L129) -a {topic_name: composed} dict, where composed is a callable +Apply the window aggregation and return results only when the windows are +closed. - +The format of returned windows: +```python +{ + "start": , + "end": , + "value: , +} +``` -## quixstreams.dataframe.dataframe +The individual window is closed when the event time +(the maximum observed timestamp across the partition) passes +its end timestamp + grace period. +The closed windows cannot receive updates anymore and are considered final. - +>***NOTE:*** Windows can be closed only within the same message key. +If some message keys appear irregularly in the stream, the latest windows +can remain unprocessed until the message the same key is received. -### StreamingDataFrame + + +#### FixedTimeWindow.current ```python -class StreamingDataFrame(BaseStreaming) +def current() -> "StreamingDataFrame" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L68) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/windows/time_based.py#L167) -`StreamingDataFrame` is the main object you will use for ETL work. - -Typically created with an `app = quixstreams.app.Application()` instance, -via `sdf = app.dataframe()`. +Apply the window transformation to the StreamingDataFrame to return results +for each updated window. 
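+
+Example Snippet (a minimal sketch; `sdf`, the window duration, and the step
+are assumed):
+
+```python
+sdf = (
+    sdf.hopping_window(duration_ms=60000, step_ms=10000)
+    .mean()
+    # Unlike `.final()`, emit an updated result for every incoming record,
+    # even before the window closes
+    .current()
+)
+```
+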
+The format of returned windows: +```python +{ + "start": , + "end": , + "value: , +} +``` -What it Does: +This method processes streaming data and returns results as they come, +regardless of whether the window is closed or not. -- Builds a data processing pipeline, declaratively (not executed immediately) - - Executes this pipeline on inputs at runtime (Kafka message values) -- Provides functions/interface similar to Pandas Dataframes/Series -- Enables stateful processing (and manages everything related to it) + +## quixstreams.dataframe -How to Use: + -Define various operations while continuously reassigning to itself (or new fields). +## quixstreams.dataframe.base -These operations will generally transform your data, access/update state, or produce -to kafka topics. + -We recommend your data structure to be "columnar" (aka a dict/JSON) in nature so -that it works with the entire interface, but simple types like `ints`, `str`, etc. -are also supported. +## quixstreams.dataframe.exceptions -See the various methods and classes for more specifics, or for a deep dive into -usage, see `streamingdataframe.md` under the `docs/` folder. + ->***NOTE:*** column referencing like `sdf["a_column"]` and various methods often - create other object types (typically `quixstreams.dataframe.StreamingSeries`), - which is expected; type hinting should alert you to any issues should you - attempt invalid operations with said objects (however, we cannot infer whether - an operation is valid with respect to your data!). +## quixstreams.dataframe.registry + -Example Snippet: +### DataframeRegistry ```python -sdf = StreamingDataFrame() -sdf = sdf.apply(a_func) -sdf = sdf.filter(another_func) -sdf = sdf.to_topic(topic_obj) +class DataframeRegistry() ``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/registry.py#L16) -#### StreamingDataFrame.apply +Helps manage multiple `StreamingDataFrames` (multi-topic `Applications`) +and their respective repartitions. + +`SDF`s are registered by storing their topic and current Stream. + + + +#### DataframeRegistry.consumer\_topics ```python -def apply(func: Union[ - ApplyCallback, - ApplyCallbackStateful, - ApplyWithMetadataCallback, - ApplyWithMetadataCallbackStateful, -], - *, - stateful: bool = False, - expand: bool = False, - metadata: bool = False) -> Self +@property +def consumer_topics() -> List[Topic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L174) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/registry.py#L31) -Apply a function to transform the value and return a new value. +**Returns**: -The result will be passed downstream as an input value. +a list of Topics a consumer should subscribe to. + -Example Snippet: +#### DataframeRegistry.register\_root ```python -# This stores a string in state and capitalizes every column with a string value. -# A second apply then keeps only the string value columns (shows non-stateful). 
-def func(d: dict, state: State): - value = d["store_field"] - if value != state.get("my_store_key"): - state.set("my_store_key") = value - return {k: v.upper() if isinstance(v, str) else v for k, v in d.items()} +def register_root(new_sdf: "StreamingDataFrame") +``` -sdf = StreamingDataFrame() -sdf = sdf.apply(func, stateful=True) -sdf = sdf.apply(lambda d: {k: v for k,v in d.items() if isinstance(v, str)}) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/registry.py#L37) -``` +Register a "root" SDF, or the start of a topic's processing. **Arguments**: -- `func`: a function to apply -- `stateful`: if `True`, the function will be provided with a second argument -of type `State` to perform stateful operations. -- `expand`: if True, expand the returned iterable into individual values -downstream. If returned value is not iterable, `TypeError` will be raised. -Default - `False`. -- `metadata`: if True, the callback will receive key, timestamp and headers -along with the value. -Default - `False`. +- `new_sdf`: the new SDF. - + -#### StreamingDataFrame.update +#### DataframeRegistry.register\_groupby ```python -def update(func: Union[ - UpdateCallback, - UpdateCallbackStateful, - UpdateWithMetadataCallback, - UpdateWithMetadataCallbackStateful, -], - *, - stateful: bool = False, - metadata: bool = False) -> Self +def register_groupby(source_sdf: "StreamingDataFrame", + new_sdf: "StreamingDataFrame") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L263) - -Apply a function to mutate value in-place or to perform a side effect +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/registry.py#L52) -(e.g., printing a value to the console). +Register a "groupby" SDF, which is one generated with `SDF.group_by()`. -The result of the function will be ignored, and the original value will be -passed downstream. +**Arguments**: -This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the -original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). +- `source_sdf`: the SDF used by `sdf.group_by()` +- `new_sdf`: the SDF generated by `sdf.group_by()`. + -Example Snippet: +#### DataframeRegistry.compose\_all ```python -# Stores a value and mutates a list by appending a new item to it. -# Also prints to console. +def compose_all( + sink: Optional[Callable[[Any, Any, int, Any], None]] = None +) -> Dict[str, VoidExecutor] +``` -def func(values: list, state: State): - value = values[0] - if value != state.get("my_store_key"): - state.set("my_store_key") = value - values.append("new_item") +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/registry.py#L75) -sdf = StreamingDataFrame() -sdf = sdf.update(func, stateful=True) -# does not require reassigning -sdf.update(lambda v: v.append(1)) -``` +Composes all the Streams and returns them in a dict, where key is its topic. **Arguments**: -- `func`: function to update value -- `stateful`: if `True`, the function will be provided with a second argument -of type `State` to perform stateful operations. -- `metadata`: if True, the callback will receive key, timestamp and headers -along with the value. -Default - `False`. +- `sink`: callable to accumulate the results of the execution, optional. **Returns**: -the updated StreamingDataFrame instance (reassignment NOT required). 
+a {topic_name: composed} dict, where composed is a callable - + -#### StreamingDataFrame.filter +## quixstreams.dataframe.series + + + +### StreamingSeries ```python -def filter(func: Union[ - FilterCallback, - FilterCallbackStateful, - FilterWithMetadataCallback, - FilterWithMetadataCallbackStateful, -], - *, - stateful: bool = False, - metadata: bool = False) -> Self +class StreamingSeries(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L355) - -Filter value using provided function. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L70) -If the function returns True-like value, the original value will be -passed downstream. - -Example Snippet: +`StreamingSeries` are typically generated by `StreamingDataframes` when getting +elements from, or performing certain operations on, a `StreamingDataframe`, +thus acting as a representation of "column" value. -```python -# Stores a value and allows further processing only if the value is greater than -# what was previously stored. +They share some operations with the `StreamingDataframe`, but also provide some +additional functionality. -def func(d: dict, state: State): - value = d["my_value"] - if value > state.get("my_store_key"): - state.set("my_store_key") = value - return True - return False +Most column value operations are handled by this class, and `StreamingSeries` can +generate other `StreamingSeries` as a result of said operations. -sdf = StreamingDataFrame() -sdf = sdf.filter(func, stateful=True) -``` -**Arguments**: +What it Does: -- `func`: function to filter value -- `stateful`: if `True`, the function will be provided with second argument -of type `State` to perform stateful operations. -- `metadata`: if True, the callback will receive key, timestamp and headers -along with the value. -Default - `False`. +- Allows ways to do simple operations with dataframe "column"/dictionary values: + - Basic ops like add, subtract, modulo, etc. +- Enables comparisons/inequalities: + - Greater than, equals, etc. + - and/or, is/not operations +- Can check for existence of columns in `StreamingDataFrames` +- Enables chaining of various operations together - -#### StreamingDataFrame.group\_by +How to Use: -```python -def group_by(key: Union[str, Callable[[Any], Any]], - name: Optional[str] = None, - value_deserializer: Optional[DeserializerType] = "json", - key_deserializer: Optional[DeserializerType] = "json", - value_serializer: Optional[SerializerType] = "json", - key_serializer: Optional[SerializerType] = "json") -> Self -``` +For the most part, you may not even notice this class exists! +They will naturally be created as a result of typical `StreamingDataFrame` use. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L441) +Auto-complete should help you with valid methods and type-checking should alert +you to invalid operations between `StreamingSeries`. -"Groups" messages by re-keying them via the provided group_by operation +In general, any typical Pands dataframe operation between columns should be valid +with `StreamingSeries`, and you shouldn't have to think about them explicitly. -on their message values. -This enables things like aggregations on messages with non-matching keys. +Example Snippet: -You can provide a column name (uses the column's value) or a custom function -to generate this new key. +```python +# Random methods for example purposes. 
More detailed explanations found under +# various methods or in the docs folder. -`.groupby()` can only be performed once per `StreamingDataFrame` instance. +sdf = StreamingDataFrame() +sdf = sdf["column_a"].apply(a_func).apply(diff_func, stateful=True) +sdf["my_new_bool_field"] = sdf["column_b"].contains("this_string") +sdf["new_sum_field"] = sdf["column_c"] + sdf["column_d"] + 2 +sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)] +``` ->**NOTE:** group_by generates a topic that copies the original topic's settings. + -Example Snippet: +#### StreamingSeries.from\_apply\_callback ```python -# We have customer purchase events where the message key is the "store_id", -# but we want to calculate sales per customer (by "customer_account_id"). +@classmethod +def from_apply_callback(cls, func: ApplyWithMetadataCallback, + sdf_id: int) -> Self +``` -def func(d: dict, state: State): - current_total = state.get("customer_sum", 0) - new_total = current_total + d["customer_spent"] - state.set("customer_sum", new_total) - d["customer_total"] = new_total - return d +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L132) -sdf = StreamingDataFrame() -sdf = sdf.group_by("customer_account_id") -sdf = sdf.apply(func, stateful=True) -``` +Create a StreamingSeries from a function. + +The provided function will be wrapped into `Apply` **Arguments**: -- `key`: how the new key should be generated from the message value; -requires a column name (string) or a callable that takes the message value. -- `name`: a name for the op (must be unique per group-by), required if `key` -is a custom callable. -- `value_deserializer`: a deserializer type for values; default - JSON -- `key_deserializer`: a deserializer type for keys; default - JSON -- `value_serializer`: a serializer type for values; default - JSON -- `key_serializer`: a serializer type for keys; default - JSON +- `func`: a function to apply +- `sdf_id`: the id of the calling `SDF`. **Returns**: -a clone with this operation added (assign to keep its effect). +instance of `StreamingSeries` - + -#### StreamingDataFrame.contains +#### StreamingSeries.apply ```python -def contains(key: str) -> StreamingSeries +def apply(func: ApplyCallback) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L514) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L155) + +Add a callable to the execution list for this series. + +The provided callable should accept a single argument, which will be its input. +The provided callable should similarly return one output, or None + +They can be chained together or included with other operations. -Check if the key is present in the Row value. Example Snippet: ```python -# Add new column 'has_column' which contains a boolean indicating -# the presence of 'column_x' +# The `StreamingSeries` are generated when `sdf["COLUMN_NAME"]` is called. +# This stores a string in state and capitalizes the column value; the result is +# assigned to a new column. +# Another apply converts a str column to an int, assigning it to a new column. 
+ +def func(value: str, state: State): + if value != state.get("my_store_key"): + state.set("my_store_key") = value + return v.upper() sdf = StreamingDataFrame() -sdf['has_column'] = sdf.contains('column_x') +sdf["new_col"] = sdf["a_column"]["nested_dict_key"].apply(func, stateful=True) +sdf["new_col_2"] = sdf["str_col"].apply(lambda v: int(v)) + sdf["str_col2"] + 2 ``` **Arguments**: -- `key`: a column name to check. +- `func`: a callable with one argument and one output **Returns**: -a Column object that evaluates to True if the key is present -or False otherwise. +a new `StreamingSeries` with the new callable added - + -#### StreamingDataFrame.to\_topic +#### StreamingSeries.compose\_returning ```python -def to_topic(topic: Topic, key: Optional[Callable[[Any], Any]] = None) -> Self +def compose_returning() -> ReturningExecutor ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L539) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L189) -Produce current value to a topic. You can optionally specify a new key. - -This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the -original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). - -Example Snippet: - -```python -from quixstreams import Application - -# Produce to two different topics, changing the key for one of them. - -app = Application() -input_topic = app.topic("input_x") -output_topic_0 = app.topic("output_a") -output_topic_1 = app.topic("output_b") +Compose a list of functions from this StreamingSeries and its parents into one -sdf = app.dataframe(input_topic) -sdf = sdf.to_topic(output_topic_0) -# does not require reassigning -sdf.to_topic(output_topic_1, key=lambda data: data["a_field"]) -``` +big closure that always returns the transformed record. -**Arguments**: +This closure is to be used to execute the functions in the stream and to get +the result of the transformations. -- `topic`: instance of `Topic` -- `key`: a callable to generate a new message key, optional. -If passed, the return type of this callable must be serializable -by `key_serializer` defined for this Topic object. -By default, the current message key will be used. +Stream may only contain simple "apply" functions to be able to compose itself +into a returning function. **Returns**: -the updated StreamingDataFrame instance (reassignment NOT required). +a callable accepting value, key and timestamp and +returning a tuple "(value, key, timestamp) - + -#### StreamingDataFrame.set\_timestamp +#### StreamingSeries.compose ```python -def set_timestamp(func: Callable[[Any, Any, int, Any], int]) -> Self +def compose( + sink: Optional[Callable[[Any, Any, int, Any], + None]] = None) -> VoidExecutor ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L584) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L204) -Set a new timestamp based on the current message value and its metadata. +Compose all functions of this StreamingSeries into one big closure. -The new timestamp will be used in windowed aggregations and when producing -messages to the output topics. +Generally not required by users; the `quixstreams.app.Application` class will +do this automatically. -The new timestamp must be in milliseconds to conform Kafka requirements. 
Example Snippet: ```python from quixstreams import Application +app = Application(...) -app = Application() -input_topic = app.topic("data") +sdf = app.dataframe() +sdf = sdf["column_a"].apply(apply_func) +sdf = sdf["column_b"].contains(filter_func) +sdf = sdf.compose() -sdf = app.dataframe(input_topic) -# Updating the record's timestamp based on the value -sdf = sdf.set_timestamp(lambda value, key, timestamp, headers: value['new_timestamp']) +result_0 = sdf({"my": "record"}) +result_1 = sdf({"other": "record"}) ``` **Arguments**: -- `func`: callable accepting the current value, key, timestamp, and headers. -It's expected to return a new timestamp as integer in milliseconds. +- `sink`: callable to accumulate the results of the execution. + +**Raises**: + +- `ValueError`: if disallowed functions are present in the tree of +underlying `Stream`. **Returns**: -a new StreamingDataFrame instance +a callable accepting value, key and timestamp and +returning None - + -#### StreamingDataFrame.set\_headers +#### StreamingSeries.test ```python -def set_headers( - func: Callable[ - [Any, Any, int, HeadersTuples], - HeadersTuples, - ]) -> Self +def test(value: Any, + key: Any, + timestamp: int, + headers: Optional[Any] = None, + ctx: Optional[MessageContext] = None) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L625) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L248) -Set new message headers based on the current message value and metadata. +A shorthand to test `StreamingSeries` with provided value -The new headers will be used when producing messages to the output topics. +and `MessageContext`. -The provided callback must accept value, key, timestamp, and headers, -and return a new collection of (header, value) tuples. +**Arguments**: -Example Snippet: +- `value`: value to pass through `StreamingSeries` +- `ctx`: instance of `MessageContext`, optional. +Provide it if the StreamingSeries instance has +functions calling `get_current_key()`. +Default - `None`. -```python -from quixstreams import Application +**Returns**: +result of `StreamingSeries` -app = Application() -input_topic = app.topic("data") + -sdf = app.dataframe(input_topic) -# Updating the record's headers based on the value and metadata -sdf = sdf.set_headers(lambda value, key, timestamp, headers: [('id', value['id'])]) -``` +#### StreamingSeries.isin + +```python +def isin(other: Container) -> Self +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L304) + +Check if series value is in "other". + +Same as "StreamingSeries in other". + +Runtime result will be a `bool`. + + +Example Snippet: + +```python +from quixstreams import Application + +# Check if "str_column" is contained in a column with a list of strings and +# assign the resulting `bool` to a new column: "has_my_str". + +sdf = app.dataframe() +sdf["has_my_str"] = sdf["str_column"].isin(sdf["column_with_list_of_strs"]) +``` **Arguments**: -- `func`: callable accepting the current value, key, timestamp, and headers. -It's expected to return a new set of headers -as a collection of (header, value) tuples. 
+- `other`: a container to check **Returns**: -a new StreamingDataFrame instance +new StreamingSeries - + -#### StreamingDataFrame.print +#### StreamingSeries.contains ```python -def print(pretty: bool = True, metadata: bool = False) -> Self +def contains(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L676) - -Print out the current message value (and optionally, the message metadata) to +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L331) -stdout (console) (like the built-in `print` function). +Check if series value contains "other" -Can also output a more dict-friendly format with `pretty=True`. +Same as "other in StreamingSeries". -This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the -original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). +Runtime result will be a `bool`. -> NOTE: prints the current (edited) values, not the original values. Example Snippet: ```python from quixstreams import Application +# Check if "column_a" contains "my_substring" and assign the resulting +# `bool` to a new column: "has_my_substr" -app = Application() -input_topic = app.topic("data") - -sdf = app.dataframe(input_topic) -sdf["edited_col"] = sdf["orig_col"] + "edited" -# print the updated message value with the newly added column -sdf.print() +sdf = app.dataframe() +sdf["has_my_substr"] = sdf["column_a"].contains("my_substring") ``` **Arguments**: -- `pretty`: Whether to use "pprint" formatting, which uses new-lines and -indents for easier console reading (but might be worse for log parsing). -- `metadata`: Whether to additionally print the key, timestamp, and headers +- `other`: object to check **Returns**: -the updated StreamingDataFrame instance (reassignment NOT required). +new StreamingSeries - + -#### StreamingDataFrame.compose +#### StreamingSeries.is\_ ```python -def compose( - sink: Optional[Callable[[Any, Any, int, Any], None]] = None -) -> Dict[str, VoidExecutor] +def is_(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L718) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L356) -Compose all functions of this StreamingDataFrame into one big closure. - -Closures are more performant than calling all the functions in the -`StreamingDataFrame` one-by-one. +Check if series value refers to the same object as `other` -Generally not required by users; the `quixstreams.app.Application` class will -do this automatically. +Runtime result will be a `bool`. Example Snippet: ```python +# Check if "column_a" is the same as "column_b" and assign the resulting `bool` +# to a new column: "is_same" + from quixstreams import Application sdf = app.dataframe() -sdf = sdf.apply(apply_func) -sdf = sdf.filter(filter_func) -sdf = sdf.compose() - -result_0 = sdf({"my": "record"}) -result_1 = sdf({"other": "record"}) +sdf["is_same"] = sdf["column_a"].is_(sdf["column_b"]) ``` **Arguments**: -- `sink`: callable to accumulate the results of the execution, optional. 
+- `other`: object to check for "is" **Returns**: -a function that accepts "value" -and returns a result of StreamingDataFrame +new StreamingSeries - + -#### StreamingDataFrame.test +#### StreamingSeries.isnot ```python -def test(value: Any, - key: Any, - timestamp: int, - headers: Optional[Any] = None, - ctx: Optional[MessageContext] = None, - topic: Optional[Topic] = None) -> List[Any] +def isnot(other: Union[Self, object]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L752) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L379) -A shorthand to test `StreamingDataFrame` with provided value +Check if series value does not refer to the same object as `other` -and `MessageContext`. +Runtime result will be a `bool`. + + +Example Snippet: + +```python +from quixstreams import Application + +# Check if "column_a" is the same as "column_b" and assign the resulting `bool` +# to a new column: "is_not_same" + +sdf = app.dataframe() +sdf["is_not_same"] = sdf["column_a"].isnot(sdf["column_b"]) +``` **Arguments**: -- `value`: value to pass through `StreamingDataFrame` -- `key`: key to pass through `StreamingDataFrame` -- `timestamp`: timestamp to pass through `StreamingDataFrame` -- `ctx`: instance of `MessageContext`, optional. -Provide it if the StreamingDataFrame instance calls `to_topic()`, -has stateful functions or windows. -Default - `None`. -- `topic`: optionally, a topic branch to test with +- `other`: object to check for "is_not" **Returns**: -result of `StreamingDataFrame` +new StreamingSeries - + -#### StreamingDataFrame.tumbling\_window +#### StreamingSeries.isnull ```python -def tumbling_window(duration_ms: Union[int, timedelta], - grace_ms: Union[int, timedelta] = 0, - name: Optional[str] = None) -> TumblingWindowDefinition +def isnull() -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L789) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L403) -Create a tumbling window transformation on this StreamingDataFrame. +Check if series value is None. -Tumbling windows divide time into fixed-sized, non-overlapping windows. +Runtime result will be a `bool`. -They allow performing stateful aggregations like `sum`, `reduce`, etc. -on top of the data and emit results downstream. -Notes: +Example Snippet: -- The timestamp of the aggregation result is set to the window start timestamp. -- Every window is grouped by the current Kafka message key. -- Messages with `None` key will be ignored. -- The time windows always use the current event time. +```python +from quixstreams import Application +# Check if "column_a" is null and assign the resulting `bool` to a new column: +# "is_null" +sdf = app.dataframe() +sdf["is_null"] = sdf["column_a"].isnull() +``` -Example Snippet: +**Returns**: -```python -app = Application() -sdf = app.dataframe(...) +new StreamingSeries -sdf = ( - # Define a tumbling window of 60s and grace period of 10s - sdf.tumbling_window( - duration_ms=timedelta(seconds=60), grace_ms=timedelta(seconds=10.0) - ) + - # Specify the aggregation function - .sum() +#### StreamingSeries.notnull - # Specify how the results should be emitted downstream. 
- # "current()" will emit results as they come for each updated window, - # possibly producing multiple messages per key-window pair - # "final()" will emit windows only when they are closed and cannot - # receive any updates anymore. - .current() -) +```python +def notnull() -> Self ``` -**Arguments**: - -- `duration_ms`: The length of each window. -Can be specified as either an `int` representing milliseconds or a -`timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `grace_ms`: The grace period for data arrival. -It allows late-arriving data (data arriving after the window -has theoretically closed) to be included in the window. -Can be specified as either an `int` representing milliseconds -or as a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `name`: The unique identifier for the window. If not provided, it will be -automatically generated based on the window's properties. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L426) -**Returns**: +Check if series value is not None. -`TumblingWindowDefinition` instance representing the tumbling window -configuration. -This object can be further configured with aggregation functions -like `sum`, `count`, etc. applied to the StreamingDataFrame. +Runtime result will be a `bool`. - -#### StreamingDataFrame.hopping\_window +Example Snippet: ```python -def hopping_window(duration_ms: Union[int, timedelta], - step_ms: Union[int, timedelta], - grace_ms: Union[int, timedelta] = 0, - name: Optional[str] = None) -> HoppingWindowDefinition +from quixstreams import Application + +# Check if "column_a" is not null and assign the resulting `bool` to a new column: +# "is_not_null" + +sdf = app.dataframe() +sdf["is_not_null"] = sdf["column_a"].notnull() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L865) +**Returns**: -Create a hopping window transformation on this StreamingDataFrame. +new StreamingSeries -Hopping windows divide the data stream into overlapping windows based on time. -The overlap is controlled by the `step_ms` parameter. + -They allow performing stateful aggregations like `sum`, `reduce`, etc. -on top of the data and emit results downstream. +#### StreamingSeries.abs -Notes: +```python +def abs() -> Self +``` -- The timestamp of the aggregation result is set to the window start timestamp. -- Every window is grouped by the current Kafka message key. -- Messages with `None` key will be ignored. -- The time windows always use the current event time. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/series.py#L449) +Get absolute value of the series value. Example Snippet: ```python -app = Application() -sdf = app.dataframe(...) - -sdf = ( - # Define a hopping window of 60s with step 30s and grace period of 10s - sdf.hopping_window( - duration_ms=timedelta(seconds=60), - step_ms=timedelta(seconds=30), - grace_ms=timedelta(seconds=10) - ) +from quixstreams import Application - # Specify the aggregation function - .sum() +# Get absolute value of "int_col" and add it to "other_int_col". +# Finally, assign the result to a new column: "abs_col_sum". - # Specify how the results should be emitted downstream. 
- # "current()" will emit results as they come for each updated window, - # possibly producing multiple messages per key-window pair - # "final()" will emit windows only when they are closed and cannot - # receive any updates anymore. - .current() -) +sdf = app.dataframe() +sdf["abs_col_sum"] = sdf["int_col"].abs() + sdf["other_int_col"] ``` -**Arguments**: +**Returns**: -- `duration_ms`: The length of each window. It defines the time span for -which each window aggregates data. -Can be specified as either an `int` representing milliseconds -or a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `step_ms`: The step size for the window. -It determines how much each successive window moves forward in time. -Can be specified as either an `int` representing milliseconds -or a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `grace_ms`: The grace period for data arrival. -It allows late-arriving data to be included in the window, -even if it arrives after the window has theoretically moved forward. -Can be specified as either an `int` representing milliseconds -or a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `name`: The unique identifier for the window. If not provided, it will be -automatically generated based on the window's properties. +new StreamingSeries -**Returns**: + -`HoppingWindowDefinition` instance representing the hopping -window configuration. -This object can be further configured with aggregation functions -like `sum`, `count`, etc. and applied to the StreamingDataFrame. +## quixstreams.dataframe.dataframe - + -#### StreamingDataFrame.sliding\_window +### StreamingDataFrame ```python -def sliding_window(duration_ms: Union[int, timedelta], - grace_ms: Union[int, timedelta] = 0, - name: Optional[str] = None) -> SlidingWindowDefinition +class StreamingDataFrame(BaseStreaming) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L957) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L68) -Create a sliding window transformation on this StreamingDataFrame. +`StreamingDataFrame` is the main object you will use for ETL work. -Sliding windows continuously evaluate the stream with a fixed step of 1 ms -allowing for overlapping, but not redundant windows of a fixed size. +Typically created with an `app = quixstreams.app.Application()` instance, +via `sdf = app.dataframe()`. -Sliding windows are similar to hopping windows with step_ms set to 1, -but are siginificantly more perforant. -They allow performing stateful aggregations like `sum`, `reduce`, etc. -on top of the data and emit results downstream. +What it Does: -Notes: +- Builds a data processing pipeline, declaratively (not executed immediately) + - Executes this pipeline on inputs at runtime (Kafka message values) +- Provides functions/interface similar to Pandas Dataframes/Series +- Enables stateful processing (and manages everything related to it) -- The timestamp of the aggregation result is set to the window start timestamp. -- Every window is grouped by the current Kafka message key. -- Messages with `None` key will be ignored. -- The time windows always use the current event time. -- Windows are inclusive on both the start end end time. -- Every window contains a distinct aggregation. 
-Example Snippet: +How to Use: -```python -app = Application() -sdf = app.dataframe(...) +Define various operations while continuously reassigning to itself (or new fields). -sdf = ( - # Define a sliding window of 60s with a grace period of 10s - sdf.sliding_window( - duration_ms=timedelta(seconds=60), - grace_ms=timedelta(seconds=10) - ) +These operations will generally transform your data, access/update state, or produce +to kafka topics. - # Specify the aggregation function - .sum() +We recommend your data structure to be "columnar" (aka a dict/JSON) in nature so +that it works with the entire interface, but simple types like `ints`, `str`, etc. +are also supported. - # Specify how the results should be emitted downstream. - # "current()" will emit results as they come for each updated window, - # possibly producing multiple messages per key-window pair - # "final()" will emit windows only when they are closed and cannot - # receive any updates anymore. - .current() -) -``` +See the various methods and classes for more specifics, or for a deep dive into +usage, see `streamingdataframe.md` under the `docs/` folder. -**Arguments**: +>***NOTE:*** column referencing like `sdf["a_column"]` and various methods often + create other object types (typically `quixstreams.dataframe.StreamingSeries`), + which is expected; type hinting should alert you to any issues should you + attempt invalid operations with said objects (however, we cannot infer whether + an operation is valid with respect to your data!). -- `duration_ms`: The length of each window. -Can be specified as either an `int` representing milliseconds or a -`timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `grace_ms`: The grace period for data arrival. -It allows late-arriving data (data arriving after the window -has theoretically closed) to be included in the window. -Can be specified as either an `int` representing milliseconds -or as a `timedelta` object. ->***NOTE:*** `timedelta` objects will be rounded to the closest millisecond -value. -- `name`: The unique identifier for the window. If not provided, it will be -automatically generated based on the window's properties. -**Returns**: +Example Snippet: -`SlidingWindowDefinition` instance representing the sliding window -configuration. -This object can be further configured with aggregation functions -like `sum`, `count`, etc. applied to the StreamingDataFrame. +```python +sdf = StreamingDataFrame() +sdf = sdf.apply(a_func) +sdf = sdf.filter(another_func) +sdf = sdf.to_topic(topic_obj) +``` - + -#### StreamingDataFrame.drop +#### StreamingDataFrame.apply ```python -def drop(columns: Union[str, List[str]], - errors: Literal["ignore", "raise"] = "raise") -> Self +def apply(func: Union[ + ApplyCallback, + ApplyCallbackStateful, + ApplyWithMetadataCallback, + ApplyWithMetadataCallbackStateful, +], + *, + stateful: bool = False, + expand: bool = False, + metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L1038) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L174) -Drop column(s) from the message value (value must support `del`, like a dict). +Apply a function to transform the value and return a new value. -This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the -original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). 
+The result will be passed downstream as an input value. Example Snippet: ```python -# Remove columns "x" and "y" from the value. -# This would transform {"x": 1, "y": 2, "z": 3} to {"z": 3} +# This stores a string in state and capitalizes every column with a string value. +# A second apply then keeps only the string value columns (shows non-stateful). +def func(d: dict, state: State): + value = d["store_field"] + if value != state.get("my_store_key"): + state.set("my_store_key") = value + return {k: v.upper() if isinstance(v, str) else v for k, v in d.items()} sdf = StreamingDataFrame() -sdf.drop(["x", "y"]) +sdf = sdf.apply(func, stateful=True) +sdf = sdf.apply(lambda d: {k: v for k,v in d.items() if isinstance(v, str)}) + ``` **Arguments**: -- `columns`: a single column name or a list of names, where names are `str` -- `errors`: If "ignore", suppress error and only existing labels are dropped. -Default - `"raise"`. - -**Returns**: - -a new StreamingDataFrame instance +- `func`: a function to apply +- `stateful`: if `True`, the function will be provided with a second argument +of type `State` to perform stateful operations. +- `expand`: if True, expand the returned iterable into individual values +downstream. If returned value is not iterable, `TypeError` will be raised. +Default - `False`. +- `metadata`: if True, the callback will receive key, timestamp and headers +along with the value. +Default - `False`. - + -#### StreamingDataFrame.sink +#### StreamingDataFrame.update ```python -def sink(sink: BaseSink) +def update(func: Union[ + UpdateCallback, + UpdateCallbackStateful, + UpdateWithMetadataCallback, + UpdateWithMetadataCallbackStateful, +], + *, + stateful: bool = False, + metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/dataframe.py#L1082) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L263) -Sink the processed data to the specified destination. +Apply a function to mutate value in-place or to perform a side effect -Internally, each processed record is added to a sink, and the sinks are -flushed on each checkpoint. -The offset will be committed only if all the sinks for all topic partitions -are flushed successfully. +(e.g., printing a value to the console). -Additionally, Sinks may signal the backpressure to the application -(e.g., when the destination is rate-limited). -When this happens, the application will pause the corresponding topic partition -and resume again after the timeout. -The backpressure handling and timeouts are defined by the specific sinks. +The result of the function will be ignored, and the original value will be +passed downstream. -Note: `sink()` is a terminal operation - it cannot receive any additional -operations, but branches can still be generated from its originating SDF. +This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the +original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). - -## quixstreams.dataframe.series +Example Snippet: - +```python +# Stores a value and mutates a list by appending a new item to it. +# Also prints to console. 
-### StreamingSeries +def func(values: list, state: State): + value = values[0] + if value != state.get("my_store_key"): + state.set("my_store_key") = value + values.append("new_item") -```python -class StreamingSeries(BaseStreaming) +sdf = StreamingDataFrame() +sdf = sdf.update(func, stateful=True) +# does not require reassigning +sdf.update(lambda v: v.append(1)) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L70) +**Arguments**: -`StreamingSeries` are typically generated by `StreamingDataframes` when getting -elements from, or performing certain operations on, a `StreamingDataframe`, -thus acting as a representation of "column" value. +- `func`: function to update value +- `stateful`: if `True`, the function will be provided with a second argument +of type `State` to perform stateful operations. +- `metadata`: if True, the callback will receive key, timestamp and headers +along with the value. +Default - `False`. -They share some operations with the `StreamingDataframe`, but also provide some -additional functionality. +**Returns**: -Most column value operations are handled by this class, and `StreamingSeries` can -generate other `StreamingSeries` as a result of said operations. +the updated StreamingDataFrame instance (reassignment NOT required). + -What it Does: +#### StreamingDataFrame.filter -- Allows ways to do simple operations with dataframe "column"/dictionary values: - - Basic ops like add, subtract, modulo, etc. -- Enables comparisons/inequalities: - - Greater than, equals, etc. - - and/or, is/not operations -- Can check for existence of columns in `StreamingDataFrames` -- Enables chaining of various operations together - - -How to Use: - -For the most part, you may not even notice this class exists! -They will naturally be created as a result of typical `StreamingDataFrame` use. +```python +def filter(func: Union[ + FilterCallback, + FilterCallbackStateful, + FilterWithMetadataCallback, + FilterWithMetadataCallbackStateful, +], + *, + stateful: bool = False, + metadata: bool = False) -> Self +``` -Auto-complete should help you with valid methods and type-checking should alert -you to invalid operations between `StreamingSeries`. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L355) -In general, any typical Pands dataframe operation between columns should be valid -with `StreamingSeries`, and you shouldn't have to think about them explicitly. +Filter value using provided function. +If the function returns True-like value, the original value will be +passed downstream. Example Snippet: ```python -# Random methods for example purposes. More detailed explanations found under -# various methods or in the docs folder. - -sdf = StreamingDataFrame() -sdf = sdf["column_a"].apply(a_func).apply(diff_func, stateful=True) -sdf["my_new_bool_field"] = sdf["column_b"].contains("this_string") -sdf["new_sum_field"] = sdf["column_c"] + sdf["column_d"] + 2 -sdf = sdf[["column_a"] & (sdf["new_sum_field"] >= 10)] -``` - - +# Stores a value and allows further processing only if the value is greater than +# what was previously stored. 
-#### StreamingSeries.from\_apply\_callback +def func(d: dict, state: State): + value = d["my_value"] + if value > state.get("my_store_key"): + state.set("my_store_key") = value + return True + return False -```python -@classmethod -def from_apply_callback(cls, func: ApplyWithMetadataCallback, - sdf_id: int) -> Self +sdf = StreamingDataFrame() +sdf = sdf.filter(func, stateful=True) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L132) - -Create a StreamingSeries from a function. - -The provided function will be wrapped into `Apply` - **Arguments**: -- `func`: a function to apply -- `sdf_id`: the id of the calling `SDF`. - -**Returns**: - -instance of `StreamingSeries` +- `func`: function to filter value +- `stateful`: if `True`, the function will be provided with second argument +of type `State` to perform stateful operations. +- `metadata`: if True, the callback will receive key, timestamp and headers +along with the value. +Default - `False`. - + -#### StreamingSeries.apply +#### StreamingDataFrame.group\_by ```python -def apply(func: ApplyCallback) -> Self +def group_by(key: Union[str, Callable[[Any], Any]], + name: Optional[str] = None, + value_deserializer: Optional[DeserializerType] = "json", + key_deserializer: Optional[DeserializerType] = "json", + value_serializer: Optional[SerializerType] = "json", + key_serializer: Optional[SerializerType] = "json") -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L155) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L441) -Add a callable to the execution list for this series. +"Groups" messages by re-keying them via the provided group_by operation -The provided callable should accept a single argument, which will be its input. -The provided callable should similarly return one output, or None +on their message values. -They can be chained together or included with other operations. +This enables things like aggregations on messages with non-matching keys. + +You can provide a column name (uses the column's value) or a custom function +to generate this new key. + +`.groupby()` can only be performed once per `StreamingDataFrame` instance. +>**NOTE:** group_by generates a topic that copies the original topic's settings. Example Snippet: ```python -# The `StreamingSeries` are generated when `sdf["COLUMN_NAME"]` is called. -# This stores a string in state and capitalizes the column value; the result is -# assigned to a new column. -# Another apply converts a str column to an int, assigning it to a new column. +# We have customer purchase events where the message key is the "store_id", +# but we want to calculate sales per customer (by "customer_account_id"). 
-def func(value: str, state: State): - if value != state.get("my_store_key"): - state.set("my_store_key") = value - return v.upper() +def func(d: dict, state: State): + current_total = state.get("customer_sum", 0) + new_total = current_total + d["customer_spent"] + state.set("customer_sum", new_total) + d["customer_total"] = new_total + return d sdf = StreamingDataFrame() -sdf["new_col"] = sdf["a_column"]["nested_dict_key"].apply(func, stateful=True) -sdf["new_col_2"] = sdf["str_col"].apply(lambda v: int(v)) + sdf["str_col2"] + 2 +sdf = sdf.group_by("customer_account_id") +sdf = sdf.apply(func, stateful=True) ``` **Arguments**: -- `func`: a callable with one argument and one output +- `key`: how the new key should be generated from the message value; +requires a column name (string) or a callable that takes the message value. +- `name`: a name for the op (must be unique per group-by), required if `key` +is a custom callable. +- `value_deserializer`: a deserializer type for values; default - JSON +- `key_deserializer`: a deserializer type for keys; default - JSON +- `value_serializer`: a serializer type for values; default - JSON +- `key_serializer`: a serializer type for keys; default - JSON **Returns**: -a new `StreamingSeries` with the new callable added +a clone with this operation added (assign to keep its effect). - + -#### StreamingSeries.compose\_returning +#### StreamingDataFrame.contains ```python -def compose_returning() -> ReturningExecutor +def contains(key: str) -> StreamingSeries ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L189) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L514) -Compose a list of functions from this StreamingSeries and its parents into one +Check if the key is present in the Row value. -big closure that always returns the transformed record. +Example Snippet: -This closure is to be used to execute the functions in the stream and to get -the result of the transformations. +```python +# Add new column 'has_column' which contains a boolean indicating +# the presence of 'column_x' -Stream may only contain simple "apply" functions to be able to compose itself -into a returning function. +sdf = StreamingDataFrame() +sdf['has_column'] = sdf.contains('column_x') +``` + +**Arguments**: + +- `key`: a column name to check. **Returns**: -a callable accepting value, key and timestamp and -returning a tuple "(value, key, timestamp) +a Column object that evaluates to True if the key is present +or False otherwise. - + -#### StreamingSeries.compose +#### StreamingDataFrame.to\_topic ```python -def compose( - sink: Optional[Callable[[Any, Any, int, Any], - None]] = None) -> VoidExecutor +def to_topic(topic: Topic, key: Optional[Callable[[Any], Any]] = None) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L204) - -Compose all functions of this StreamingSeries into one big closure. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L539) -Generally not required by users; the `quixstreams.app.Application` class will -do this automatically. +Produce current value to a topic. You can optionally specify a new key. +This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the +original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). 
Example Snippet: ```python from quixstreams import Application -app = Application(...) +# Produce to two different topics, changing the key for one of them. -sdf = app.dataframe() -sdf = sdf["column_a"].apply(apply_func) -sdf = sdf["column_b"].contains(filter_func) -sdf = sdf.compose() +app = Application() +input_topic = app.topic("input_x") +output_topic_0 = app.topic("output_a") +output_topic_1 = app.topic("output_b") -result_0 = sdf({"my": "record"}) -result_1 = sdf({"other": "record"}) +sdf = app.dataframe(input_topic) +sdf = sdf.to_topic(output_topic_0) +# does not require reassigning +sdf.to_topic(output_topic_1, key=lambda data: data["a_field"]) ``` **Arguments**: -- `sink`: callable to accumulate the results of the execution. - -**Raises**: - -- `ValueError`: if disallowed functions are present in the tree of -underlying `Stream`. +- `topic`: instance of `Topic` +- `key`: a callable to generate a new message key, optional. +If passed, the return type of this callable must be serializable +by `key_serializer` defined for this Topic object. +By default, the current message key will be used. **Returns**: -a callable accepting value, key and timestamp and -returning None +the updated StreamingDataFrame instance (reassignment NOT required). - + -#### StreamingSeries.test +#### StreamingDataFrame.set\_timestamp ```python -def test(value: Any, - key: Any, - timestamp: int, - headers: Optional[Any] = None, - ctx: Optional[MessageContext] = None) -> Any +def set_timestamp(func: Callable[[Any, Any, int, Any], int]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L248) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L584) -A shorthand to test `StreamingSeries` with provided value +Set a new timestamp based on the current message value and its metadata. -and `MessageContext`. +The new timestamp will be used in windowed aggregations and when producing +messages to the output topics. + +The new timestamp must be in milliseconds to conform Kafka requirements. + +Example Snippet: + +```python +from quixstreams import Application + + +app = Application() +input_topic = app.topic("data") + +sdf = app.dataframe(input_topic) +# Updating the record's timestamp based on the value +sdf = sdf.set_timestamp(lambda value, key, timestamp, headers: value['new_timestamp']) +``` **Arguments**: -- `value`: value to pass through `StreamingSeries` -- `ctx`: instance of `MessageContext`, optional. -Provide it if the StreamingSeries instance has -functions calling `get_current_key()`. -Default - `None`. +- `func`: callable accepting the current value, key, timestamp, and headers. +It's expected to return a new timestamp as integer in milliseconds. **Returns**: -result of `StreamingSeries` +a new StreamingDataFrame instance - + -#### StreamingSeries.isin +#### StreamingDataFrame.set\_headers ```python -def isin(other: Container) -> Self +def set_headers( + func: Callable[ + [Any, Any, int, HeadersTuples], + HeadersTuples, + ]) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L304) - -Check if series value is in "other". +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L625) -Same as "StreamingSeries in other". +Set new message headers based on the current message value and metadata. -Runtime result will be a `bool`. 
+The new headers will be used when producing messages to the output topics. +The provided callback must accept value, key, timestamp, and headers, +and return a new collection of (header, value) tuples. Example Snippet: ```python from quixstreams import Application -# Check if "str_column" is contained in a column with a list of strings and -# assign the resulting `bool` to a new column: "has_my_str". -sdf = app.dataframe() -sdf["has_my_str"] = sdf["str_column"].isin(sdf["column_with_list_of_strs"]) +app = Application() +input_topic = app.topic("data") + +sdf = app.dataframe(input_topic) +# Updating the record's headers based on the value and metadata +sdf = sdf.set_headers(lambda value, key, timestamp, headers: [('id', value['id'])]) ``` **Arguments**: -- `other`: a container to check +- `func`: callable accepting the current value, key, timestamp, and headers. +It's expected to return a new set of headers +as a collection of (header, value) tuples. **Returns**: -new StreamingSeries +a new StreamingDataFrame instance - + -#### StreamingSeries.contains +#### StreamingDataFrame.print ```python -def contains(other: Union[Self, object]) -> Self +def print(pretty: bool = True, metadata: bool = False) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L331) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L676) -Check if series value contains "other" +Print out the current message value (and optionally, the message metadata) to -Same as "other in StreamingSeries". +stdout (console) (like the built-in `print` function). -Runtime result will be a `bool`. +Can also output a more dict-friendly format with `pretty=True`. + +This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the +original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`). +> NOTE: prints the current (edited) values, not the original values. Example Snippet: ```python from quixstreams import Application -# Check if "column_a" contains "my_substring" and assign the resulting -# `bool` to a new column: "has_my_substr" -sdf = app.dataframe() -sdf["has_my_substr"] = sdf["column_a"].contains("my_substring") +app = Application() +input_topic = app.topic("data") + +sdf = app.dataframe(input_topic) +sdf["edited_col"] = sdf["orig_col"] + "edited" +# print the updated message value with the newly added column +sdf.print() ``` **Arguments**: -- `other`: object to check +- `pretty`: Whether to use "pprint" formatting, which uses new-lines and +indents for easier console reading (but might be worse for log parsing). +- `metadata`: Whether to additionally print the key, timestamp, and headers **Returns**: -new StreamingSeries +the updated StreamingDataFrame instance (reassignment NOT required). - + -#### StreamingSeries.is\_ +#### StreamingDataFrame.compose ```python -def is_(other: Union[Self, object]) -> Self +def compose( + sink: Optional[Callable[[Any, Any, int, Any], None]] = None +) -> Dict[str, VoidExecutor] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L356) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L718) -Check if series value refers to the same object as `other` +Compose all functions of this StreamingDataFrame into one big closure. -Runtime result will be a `bool`. 
+Closures are more performant than calling all the functions in the +`StreamingDataFrame` one-by-one. + +Generally not required by users; the `quixstreams.app.Application` class will +do this automatically. Example Snippet: ```python -# Check if "column_a" is the same as "column_b" and assign the resulting `bool` -# to a new column: "is_same" - from quixstreams import Application sdf = app.dataframe() -sdf["is_same"] = sdf["column_a"].is_(sdf["column_b"]) +sdf = sdf.apply(apply_func) +sdf = sdf.filter(filter_func) +sdf = sdf.compose() + +result_0 = sdf({"my": "record"}) +result_1 = sdf({"other": "record"}) ``` **Arguments**: -- `other`: object to check for "is" +- `sink`: callable to accumulate the results of the execution, optional. **Returns**: -new StreamingSeries +a function that accepts "value" +and returns a result of StreamingDataFrame - + -#### StreamingSeries.isnot +#### StreamingDataFrame.test ```python -def isnot(other: Union[Self, object]) -> Self +def test(value: Any, + key: Any, + timestamp: int, + headers: Optional[Any] = None, + ctx: Optional[MessageContext] = None, + topic: Optional[Topic] = None) -> List[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L379) - -Check if series value does not refer to the same object as `other` - -Runtime result will be a `bool`. - - -Example Snippet: - -```python -from quixstreams import Application +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L752) -# Check if "column_a" is the same as "column_b" and assign the resulting `bool` -# to a new column: "is_not_same" +A shorthand to test `StreamingDataFrame` with provided value -sdf = app.dataframe() -sdf["is_not_same"] = sdf["column_a"].isnot(sdf["column_b"]) -``` +and `MessageContext`. **Arguments**: -- `other`: object to check for "is_not" +- `value`: value to pass through `StreamingDataFrame` +- `key`: key to pass through `StreamingDataFrame` +- `timestamp`: timestamp to pass through `StreamingDataFrame` +- `ctx`: instance of `MessageContext`, optional. +Provide it if the StreamingDataFrame instance calls `to_topic()`, +has stateful functions or windows. +Default - `None`. +- `topic`: optionally, a topic branch to test with **Returns**: -new StreamingSeries +result of `StreamingDataFrame` - + -#### StreamingSeries.isnull +#### StreamingDataFrame.tumbling\_window ```python -def isnull() -> Self +def tumbling_window(duration_ms: Union[int, timedelta], + grace_ms: Union[int, timedelta] = 0, + name: Optional[str] = None) -> TumblingWindowDefinition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L403) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L789) -Check if series value is None. +Create a tumbling window transformation on this StreamingDataFrame. -Runtime result will be a `bool`. +Tumbling windows divide time into fixed-sized, non-overlapping windows. +They allow performing stateful aggregations like `sum`, `reduce`, etc. +on top of the data and emit results downstream. -Example Snippet: +Notes: -```python -from quixstreams import Application +- The timestamp of the aggregation result is set to the window start timestamp. +- Every window is grouped by the current Kafka message key. +- Messages with `None` key will be ignored. +- The time windows always use the current event time. 
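+
+A quick way to see how these fixed, non-overlapping buckets behave: the window a
+record falls into can be derived from its timestamp and the window duration alone.
+The helper below is only an illustrative sketch of that bucketing rule; it is not
+the library's internal implementation or part of its public API:
+
+```python
+def tumbling_window_bounds(timestamp_ms: int, duration_ms: int) -> tuple:
+    # The window start is the timestamp rounded down to a multiple of the duration,
+    # so each timestamp belongs to exactly one window.
+    start = timestamp_ms - (timestamp_ms % duration_ms)
+    return start, start + duration_ms
+
+# A record at t=1_000_500 ms with a 60s window lands in [960000, 1020000).
+print(tumbling_window_bounds(1_000_500, 60_000))
+```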
-# Check if "column_a" is null and assign the resulting `bool` to a new column: -# "is_null" -sdf = app.dataframe() -sdf["is_null"] = sdf["column_a"].isnull() -``` -**Returns**: +Example Snippet: -new StreamingSeries +```python +app = Application() +sdf = app.dataframe(...) - +sdf = ( + # Define a tumbling window of 60s and grace period of 10s + sdf.tumbling_window( + duration_ms=timedelta(seconds=60), grace_ms=timedelta(seconds=10.0) + ) -#### StreamingSeries.notnull + # Specify the aggregation function + .sum() -```python -def notnull() -> Self + # Specify how the results should be emitted downstream. + # "current()" will emit results as they come for each updated window, + # possibly producing multiple messages per key-window pair + # "final()" will emit windows only when they are closed and cannot + # receive any updates anymore. + .current() +) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L426) - -Check if series value is not None. +**Arguments**: -Runtime result will be a `bool`. +- `duration_ms`: The length of each window. +Can be specified as either an `int` representing milliseconds or a +`timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `grace_ms`: The grace period for data arrival. +It allows late-arriving data (data arriving after the window +has theoretically closed) to be included in the window. +Can be specified as either an `int` representing milliseconds +or as a `timedelta` object. +>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond +value. +- `name`: The unique identifier for the window. If not provided, it will be +automatically generated based on the window's properties. +**Returns**: -Example Snippet: +`TumblingWindowDefinition` instance representing the tumbling window +configuration. +This object can be further configured with aggregation functions +like `sum`, `count`, etc. applied to the StreamingDataFrame. -```python -from quixstreams import Application + -# Check if "column_a" is not null and assign the resulting `bool` to a new column: -# "is_not_null" +#### StreamingDataFrame.hopping\_window -sdf = app.dataframe() -sdf["is_not_null"] = sdf["column_a"].notnull() +```python +def hopping_window(duration_ms: Union[int, timedelta], + step_ms: Union[int, timedelta], + grace_ms: Union[int, timedelta] = 0, + name: Optional[str] = None) -> HoppingWindowDefinition ``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L865) -new StreamingSeries +Create a hopping window transformation on this StreamingDataFrame. - +Hopping windows divide the data stream into overlapping windows based on time. +The overlap is controlled by the `step_ms` parameter. -#### StreamingSeries.abs +They allow performing stateful aggregations like `sum`, `reduce`, etc. +on top of the data and emit results downstream. -```python -def abs() -> Self -``` +Notes: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/series.py#L449) +- The timestamp of the aggregation result is set to the window start timestamp. +- Every window is grouped by the current Kafka message key. +- Messages with `None` key will be ignored. +- The time windows always use the current event time. -Get absolute value of the series value. Example Snippet: ```python -from quixstreams import Application - -# Get absolute value of "int_col" and add it to "other_int_col". 
-# Finally, assign the result to a new column: "abs_col_sum".
-
-sdf = app.dataframe()
-sdf["abs_col_sum"] = sdf["int_col"].abs() + sdf["other_int_col"]
-```
+app = Application()
+sdf = app.dataframe(...)

-**Returns**:
+sdf = (
+    # Define a hopping window of 60s with step 30s and grace period of 10s
+    sdf.hopping_window(
+        duration_ms=timedelta(seconds=60),
+        step_ms=timedelta(seconds=30),
+        grace_ms=timedelta(seconds=10)
+    )

-new StreamingSeries
+    # Specify the aggregation function
+    .sum()

- 
+    # Specify how the results should be emitted downstream.
+    # "current()" will emit results as they come for each updated window,
+    # possibly producing multiple messages per key-window pair
+    # "final()" will emit windows only when they are closed and cannot
+    # receive any updates anymore.
+    .current()
+)
+```

-## quixstreams.dataframe
+**Arguments**:

- 
+- `duration_ms`: The length of each window. It defines the time span for
+which each window aggregates data.
+Can be specified as either an `int` representing milliseconds
+or a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `step_ms`: The step size for the window.
+It determines how much each successive window moves forward in time.
+Can be specified as either an `int` representing milliseconds
+or a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `grace_ms`: The grace period for data arrival.
+It allows late-arriving data to be included in the window,
+even if it arrives after the window has theoretically moved forward.
+Can be specified as either an `int` representing milliseconds
+or a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `name`: The unique identifier for the window. If not provided, it will be
+automatically generated based on the window's properties.

-## quixstreams.dataframe.utils
+**Returns**:

- 
+`HoppingWindowDefinition` instance representing the hopping
+window configuration.
+This object can be further configured with aggregation functions
+like `sum`, `count`, etc. and applied to the StreamingDataFrame.

-#### ensure\_milliseconds
+ 
+
+#### StreamingDataFrame.sliding\_window

```python
-def ensure_milliseconds(delta: Union[int, timedelta]) -> int
+def sliding_window(duration_ms: Union[int, timedelta],
+                   grace_ms: Union[int, timedelta] = 0,
+                   name: Optional[str] = None) -> SlidingWindowDefinition
```

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/utils.py#L5)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L957)

-Convert timedelta to milliseconds.
+Create a sliding window transformation on this StreamingDataFrame.

-If the `delta` is not
-This function will also round the value to the closest milliseconds in case of
-higher precision.
+Sliding windows continuously evaluate the stream with a fixed step of 1 ms
+allowing for overlapping, but not redundant windows of a fixed size.

-**Arguments**:
+Sliding windows are similar to hopping windows with step_ms set to 1,
+but are significantly more performant.

-- `delta`: `timedelta` object
+They allow performing stateful aggregations like `sum`, `reduce`, etc.
+on top of the data and emit results downstream.

-**Returns**:
+Notes:

-timedelta value in milliseconds as `int`
+- The timestamp of the aggregation result is set to the window start timestamp.
+- Every window is grouped by the current Kafka message key.
+- Messages with `None` key will be ignored.
+- The time windows always use the current event time.
+- Windows are inclusive on both the start and end time.
+- Every window contains a distinct aggregation.

- 
+Example Snippet:

-## quixstreams.dataframe.exceptions
+```python
+app = Application()
+sdf = app.dataframe(...)

- 
+sdf = (
+    # Define a sliding window of 60s with a grace period of 10s
+    sdf.sliding_window(
+        duration_ms=timedelta(seconds=60),
+        grace_ms=timedelta(seconds=10)
+    )

-## quixstreams.dataframe.windows.sliding
+    # Specify the aggregation function
+    .sum()

- 
+    # Specify how the results should be emitted downstream.
+    # "current()" will emit results as they come for each updated window,
+    # possibly producing multiple messages per key-window pair
+    # "final()" will emit windows only when they are closed and cannot
+    # receive any updates anymore.
+    .current()
+)
+```

-### SlidingWindow

**Arguments**:

-```python
-class SlidingWindow(FixedTimeWindow)
-```
+- `duration_ms`: The length of each window.
+Can be specified as either an `int` representing milliseconds or a
+`timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `grace_ms`: The grace period for data arrival.
+It allows late-arriving data (data arriving after the window
+has theoretically closed) to be included in the window.
+Can be specified as either an `int` representing milliseconds
+or as a `timedelta` object.
+>***NOTE:*** `timedelta` objects will be rounded to the closest millisecond
+value.
+- `name`: The unique identifier for the window. If not provided, it will be
+automatically generated based on the window's properties.

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/sliding.py#L9)

**Returns**:

- 
+`SlidingWindowDefinition` instance representing the sliding window
+configuration.
+This object can be further configured with aggregation functions
+like `sum`, `count`, etc. applied to the StreamingDataFrame.

-#### SlidingWindow.process\_window
+ 
+
+#### StreamingDataFrame.drop

```python
-def process_window(
-    value: Any, timestamp_ms: int, state: WindowedState
-) -> tuple[Iterable[WindowResult], Iterable[WindowResult]]
+def drop(columns: Union[str, List[str]],
+         errors: Literal["ignore", "raise"] = "raise") -> Self
```

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/sliding.py#L10)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L1038)

-The algorithm is based on the concept that each message
-is associated with a left and a right window.
+Drop column(s) from the message value (value must support `del`, like a dict).

-Left Window:
-- Begins at message timestamp - window size
-- Ends at message timestamp
+This operation occurs in-place, meaning reassignment is entirely OPTIONAL: the
+original `StreamingDataFrame` is returned for chaining (`sdf.update().print()`).

-Right Window:
-- Begins at message timestamp + 1 ms
-- Ends at message timestamp + 1 ms + window size

-For example, for a window size of 10 and a message A arriving at timestamp 26:
+Example Snippet:

-    0        10        20        30        40        50        60
------|---------|---------|---------|---------|---------|---------|--->
-                               A
-left window ->    |---------||---------|    <- right window
-                  16      26  27      37
+```python
+# Remove columns "x" and "y" from the value.
+# This would transform {"x": 1, "y": 2, "z": 3} to {"z": 3} -The algorithm scans backward through the window store: -- Starting at: start_time = message timestamp + 1 ms (the right window's start time) -- Ending at: start_time = message timestamp - 2 * window size +sdf = StreamingDataFrame() +sdf.drop(["x", "y"]) +``` -During this traversal, the algorithm performs the following actions: +**Arguments**: -1. Determine if the right window should be created. - If yes, locate the existing aggregation to copy to the new window. -2. Determine if the right window of the previous record should be created. - If yes, locate the existing aggregation and combine it with the incoming message. -3. Locate and update the left window if it exists. -4. If the left window does not exist, create it. Locate the existing - aggregation and combine it with the incoming message. -5. Locate and update all existing windows to which the new message belongs. +- `columns`: a single column name or a list of names, where names are `str` +- `errors`: If "ignore", suppress error and only existing labels are dropped. +Default - `"raise"`. - +**Returns**: -## quixstreams.dataframe.windows.definitions +a new StreamingDataFrame instance - + -### FixedTimeWindowDefinition +#### StreamingDataFrame.sink ```python -class FixedTimeWindowDefinition(abc.ABC) +def sink(sink: BaseSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/dataframe/dataframe.py#L1082) - +Sink the processed data to the specified destination. -#### FixedTimeWindowDefinition.sum +Internally, each processed record is added to a sink, and the sinks are +flushed on each checkpoint. +The offset will be committed only if all the sinks for all topic partitions +are flushed successfully. -```python -def sum() -> "FixedTimeWindow" -``` +Additionally, Sinks may signal the backpressure to the application +(e.g., when the destination is rate-limited). +When this happens, the application will pause the corresponding topic partition +and resume again after the timeout. +The backpressure handling and timeouts are defined by the specific sinks. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L66) +Note: `sink()` is a terminal operation - it cannot receive any additional +operations, but branches can still be generated from its originating SDF. -Configure the window to aggregate data by summing up values within + -each window period. +## quixstreams.exceptions.base -**Returns**: + -an instance of `FixedTimeWindow` configured to perform sum aggregation. +## quixstreams.exceptions.assignment - + -#### FixedTimeWindowDefinition.count +### PartitionAssignmentError ```python -def count() -> "FixedTimeWindow" +class PartitionAssignmentError(QuixException) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L81) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/exceptions/assignment.py#L6) -Configure the window to aggregate data by counting the number of values +Error happened during partition rebalancing. +Raised from `on_assign`, `on_revoke` and `on_lost` callbacks -within each window period. + -**Returns**: +## quixstreams.exceptions -an instance of `FixedTimeWindow` configured to perform record count. 
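+
+The `StreamingDataFrame.sink()` entry above ships without a usage snippet, so the
+following is a minimal, hypothetical sketch of wiring a sink into an application.
+`my_sink` is a placeholder for any configured `BaseSink` instance (a built-in sink
+or a custom subclass); it is not a name defined in this reference.
+
+```python
+from quixstreams import Application
+
+app = Application(broker_address="localhost:9092")
+sdf = app.dataframe(app.topic("input"))
+
+# ...apply / filter / window operations as needed...
+
+# sink() is terminal, so attach it last. Records are buffered by the sink and
+# flushed on each checkpoint; offsets are committed only after a successful flush.
+sdf.sink(my_sink)  # `my_sink`: placeholder BaseSink instance
+
+app.run()
+```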
+ - +## quixstreams.kafka.exceptions -#### FixedTimeWindowDefinition.mean + + +## quixstreams.kafka + + + +## quixstreams.kafka.configuration + + + +### ConnectionConfig ```python -def mean() -> "FixedTimeWindow" +class ConnectionConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L96) - -Configure the window to aggregate data by calculating the mean of the values +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/configuration.py#L21) -within each window period. +Provides an interface for all librdkafka connection-based configs. -**Returns**: +Allows converting to or from a librdkafka dictionary. -an instance of `FixedTimeWindow` configured to calculate the mean -of the values. +Also obscures secrets and handles any case sensitivity issues. - + -#### FixedTimeWindowDefinition.reduce +#### ConnectionConfig.settings\_customise\_sources ```python -def reduce(reducer: Callable[[Any, Any], Any], - initializer: Callable[[Any], Any]) -> "FixedTimeWindow" +@classmethod +def settings_customise_sources( + cls, settings_cls: Type[PydanticBaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + dotenv_settings: PydanticBaseSettingsSource, + file_secret_settings: PydanticBaseSettingsSource +) -> Tuple[PydanticBaseSettingsSource, ...] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L116) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/configuration.py#L99) -Configure the window to perform a custom aggregation using `reducer` +Included to ignore reading/setting values from the environment -and `initializer` functions. + -Example Snippet: -```python -sdf = StreamingDataFrame(...) +#### ConnectionConfig.from\_librdkafka\_dict -# Using "reduce()" to calculate multiple aggregates at once -def reducer(agg: dict, current: int): - aggregated = { - 'min': min(agg['min'], current), - 'max': max(agg['max'], current), - 'count': agg['count'] + 1 - } - return aggregated +```python +@classmethod +def from_librdkafka_dict(cls, + config: dict, + ignore_extras: bool = False) -> Self +``` -def initializer(current) -> dict: - return {'min': current, 'max': current, 'count': 1} +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/configuration.py#L113) -window = ( - sdf.tumbling_window(duration_ms=1000) - .reduce(reducer=reducer, initializer=initializer) - .final() -) -``` +Create a `ConnectionConfig` from a librdkafka config dictionary. **Arguments**: -- `reducer`: A function that takes two arguments -(the accumulated value and a new value) and returns a single value. -The returned value will be saved to the state store and sent downstream. -- `initializer`: A function to call for every first element of the window. -This function is used to initialize the aggregation within a window. +- `config`: a dict of configs (like {"bootstrap.servers": "url"}) +- `ignore_extras`: Ignore non-connection settings (else raise exception) **Returns**: -A window configured to perform custom reduce aggregation on the data. 
+a ConnectionConfig - + -#### FixedTimeWindowDefinition.max +#### ConnectionConfig.as\_librdkafka\_dict ```python -def max() -> "FixedTimeWindow" +def as_librdkafka_dict(plaintext_secrets: bool = True) -> dict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L162) - -Configure a window to aggregate the maximum value within each window period. - -**Returns**: - -an instance of `FixedTimeWindow` configured to calculate the maximum -value within each window period. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/configuration.py#L128) -#### FixedTimeWindowDefinition.min +Dump any non-empty config values as a librdkafka dictionary. -```python -def min() -> "FixedTimeWindow" -``` +>***NOTE***: All secret values will be dumped in PLAINTEXT by default. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/definitions.py#L177) +**Arguments**: -Configure a window to aggregate the minimum value within each window period. +- `plaintext_secrets`: whether secret values are plaintext or obscured (***) **Returns**: -an instance of `FixedTimeWindow` configured to calculate the maximum -value within each window period. - - - -## quixstreams.dataframe.windows +a librdkafka-compatible dictionary - + -## quixstreams.dataframe.windows.time\_based +## quixstreams.kafka.consumer - + -### FixedTimeWindow +### BaseConsumer ```python -class FixedTimeWindow() +class BaseConsumer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/time_based.py#L42) - - - -#### FixedTimeWindow.final - -```python -def final() -> "StreamingDataFrame" -``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L68) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/time_based.py#L129) + -Apply the window aggregation and return results only when the windows are -closed. +#### BaseConsumer.\_\_init\_\_ -The format of returned windows: ```python -{ - "start": , - "end": , - "value: , -} +def __init__(broker_address: Union[str, ConnectionConfig], + consumer_group: Optional[str], + auto_offset_reset: AutoOffsetReset, + auto_commit_enable: bool = True, + logger: logging.Logger = logger, + error_callback: Callable[[KafkaError], None] = _default_error_cb, + on_commit: Optional[Callable[ + [Optional[KafkaError], List[TopicPartition]], None]] = None, + extra_config: Optional[dict] = None) ``` -The individual window is closed when the event time -(the maximum observed timestamp across the partition) passes -its end timestamp + grace period. -The closed windows cannot receive updates anymore and are considered final. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L69) ->***NOTE:*** Windows can be closed only within the same message key. -If some message keys appear irregularly in the stream, the latest windows -can remain unprocessed until the message the same key is received. +A wrapper around `confluent_kafka.Consumer`. - +It initializes `confluent_kafka.Consumer` on demand +avoiding network calls during `__init__`, provides typing info for methods +and some reasonable defaults. -#### FixedTimeWindow.current +**Arguments**: -```python -def current() -> "StreamingDataFrame" -``` +- `broker_address`: Connection settings for Kafka. 
+Accepts string with Kafka broker host and port formatted as `:`, +or a ConnectionConfig object if authentication is required. +- `consumer_group`: Kafka consumer group. +Passed as `group.id` to `confluent_kafka.Consumer` +- `auto_offset_reset`: Consumer `auto.offset.reset` setting. +Available values: +
"earliest" - automatically reset the offset to the smallest offset +
"latest" - automatically reset the offset to the largest offset +
"error" - trigger an error (`ERR__AUTO_OFFSET_RESET`) which is + retrieved by consuming messages (used for testing) +- `auto_commit_enable`: If true, periodically commit offset of +the last message handed to the application. Default - `True`. +- `logger`: a Logger instance to attach librdkafka logging to +- `error_callback`: callback used for consumer errors +- `on_commit`: Offset commit result propagation callback. +Passed as "offset_commit_cb" to `confluent_kafka.Consumer`. +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Consumer` as is. +Note: values passed as arguments override values in `extra_config`. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/time_based.py#L167) + -Apply the window transformation to the StreamingDataFrame to return results -for each updated window. +#### BaseConsumer.poll -The format of returned windows: ```python -{ - "start": , - "end": , - "value: , -} +def poll(timeout: Optional[float] = None) -> Optional[Message] ``` -This method processes streaming data and returns results as they come, -regardless of whether the window is closed or not. - - - -## quixstreams.dataframe.windows.base - - - -#### get\_window\_ranges +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L132) -```python -def get_window_ranges(timestamp_ms: int, - duration_ms: int, - step_ms: Optional[int] = None) -> Deque[Tuple[int, int]] -``` +Consumes a single message, calls callbacks and returns events. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/dataframe/windows/base.py#L17) +The application must check the returned :py:class:`Message` +object's :py:func:`Message.error()` method to distinguish between proper +messages (error() returns None), or an event or error. -Get a list of window ranges for the given timestamp. +Note: a `RebalancingCallback` may be called from this method ( +`on_assign`, `on_revoke`, or `on_lost`). **Arguments**: -- `timestamp_ms`: timestamp in milliseconds -- `duration_ms`: window duration in milliseconds -- `step_ms`: window step in milliseconds for hopping windows, optional. - -**Returns**: - -a list of (, ) tuples +- `timeout` (`float`): Maximum time in seconds to block waiting for message, +event or callback. None or -1 is infinite. Default: None. - +**Raises**: -## quixstreams.dataframe.base +- `RuntimeError`: if called on a closed consumer - +**Returns**: -## quixstreams.rowproducer +`Optional[Message]`: A `Message` object or `None` on timeout - + -### RowProducer +#### BaseConsumer.unsubscribe ```python -class RowProducer() +def unsubscribe() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowproducer.py#L72) - -A producer class that is capable of serializing Rows to bytes and send them to Kafka. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L235) -The serialization is performed according to the Topic serialization settings. +Remove current subscription. -**Arguments**: +**Raises**: -- `broker_address`: Connection settings for Kafka. -Accepts string with Kafka broker host and port formatted as `:`, -or a ConnectionConfig object if authentication is required. -- `extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Producer` as is. -Note: values passed as arguments override values in `extra_config`. 
-- `on_error`: a callback triggered when `RowProducer.produce_row()` -or `RowProducer.poll()` fail`. -If producer fails and the callback returns `True`, the exception -will be logged but not propagated. -The default callback logs an exception and returns `False`. -- `flush_timeout`: The time the producer is waiting for all messages to be delivered. -- `transactional`: whether to use Kafka transactions or not. -Note this changes which underlying `Producer` class is used. +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer - + -#### RowProducer.produce\_row +#### BaseConsumer.store\_offsets ```python -def produce_row(row: Row, - topic: Topic, - key: Optional[Any] = _KEY_UNSET, - partition: Optional[int] = None, - timestamp: Optional[int] = None) +def store_offsets(message: Optional[Message] = None, + offsets: Optional[List[TopicPartition]] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowproducer.py#L119) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L244) -Serialize Row to bytes according to the Topic serialization settings - -and produce it to Kafka +Store offsets for a message or a list of offsets. -If this method fails, it will trigger the provided "on_error" callback. +`message` and `offsets` are mutually exclusive. The stored offsets +will be committed according to 'auto.commit.interval.ms' or manual +offset-less `commit`. +Note that 'enable.auto.offset.store' must be set to False when using this API. **Arguments**: -- `row`: Row object -- `topic`: Topic object -- `key`: message key, optional -- `partition`: partition number, optional -- `timestamp`: timestamp in milliseconds, optional +- `message` (`confluent_kafka.Message`): Store message's offset+1. +- `offsets` (`List[TopicPartition]`): List of topic+partitions+offsets to store. - +**Raises**: -#### RowProducer.poll +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer + + + +#### BaseConsumer.commit ```python -def poll(timeout: float = 0) +def commit(message: Optional[Message] = None, + offsets: Optional[List[TopicPartition]] = None, + asynchronous: bool = True) -> Optional[List[TopicPartition]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowproducer.py#L159) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L275) -Polls the producer for events and calls `on_delivery` callbacks. +Commit a message or a list of offsets. -If `poll()` fails, it will trigger the provided "on_error" callback +The `message` and `offsets` parameters are mutually exclusive. +If neither is set, the current partition assignment's offsets are used instead. +Use this method to commit offsets if you have 'enable.auto.commit' set to False. **Arguments**: -- `timeout`: timeout in seconds - - - -#### RowProducer.abort\_transaction - -```python -def abort_transaction(timeout: Optional[float] = None) -``` +- `message` (`Message`): Commit the message's offset+1. +Note: By convention, committed offsets reflect the next message +to be consumed, **not** the last message consumed. +- `offsets` (`List[TopicPartition]`): List of topic+partitions+offsets to commit. +- `asynchronous` (`bool`): If true, asynchronously commit, returning None +immediately. 
If False, the commit() call will block until the commit +succeeds or fails and the committed offsets will be returned (on success). +Note that specific partitions may have failed and the .err field of +each partition should be checked for success. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowproducer.py#L230) +**Raises**: -Attempt an abort if an active transaction. +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer -Else, skip since it throws an exception if at least -one transaction was successfully completed at some point. + -This avoids polluting the stack trace in the case where a transaction was -not active as expected (because of some other exception already raised) -and a cleanup abort is attempted. +#### BaseConsumer.committed -NOTE: under normal circumstances a transaction will be open due to how -the Checkpoint inits another immediately after committing. +```python +def committed(partitions: List[TopicPartition], + timeout: Optional[float] = None) -> List[TopicPartition] +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L316) -## quixstreams.core.stream +Retrieve committed offsets for the specified partitions. - +**Arguments**: -## quixstreams.core.stream.stream +- `partitions` (`List[TopicPartition]`): List of topic+partitions to query for stored offsets. +- `timeout` (`float`): Request timeout (seconds). +None or -1 is infinite. Default: None - +**Raises**: -### Stream +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer -```python -class Stream() -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L38) +`List[TopicPartition]`: List of topic+partitions with offset and possibly error set. - + -#### Stream.\_\_init\_\_ +#### BaseConsumer.get\_watermark\_offsets ```python -def __init__(func: Optional[StreamFunction] = None, - parent: Optional[Self] = None) +def get_watermark_offsets(partition: TopicPartition, + timeout: Optional[float] = None, + cached: bool = False) -> Tuple[int, int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L334) -A base class for all streaming operations. +Retrieve low and high offsets for the specified partition. -`Stream` is an abstraction of a function pipeline. -Each Stream has a function and a parent (None by default). -When adding new function to the stream, it creates a new `Stream` object and -sets "parent" to the previous `Stream` to maintain an order of execution. +**Arguments**: -Streams supports four types of functions: +- `partition` (`TopicPartition`): Topic+partition to return offsets for. +- `timeout` (`float`): Request timeout (seconds). None or -1 is infinite. +Ignored if cached=True. Default: None +- `cached` (`bool`): Instead of querying the broker, use cached information. +Cached values: The low offset is updated periodically +(if statistics.interval.ms is set) while the high offset is updated on each +message fetched from the broker for this partition. -- "Apply" - generate new values based on a previous one. - The result of an Apply function is passed downstream to the next functions. 
- If "expand=True" is passed and the function returns an `Iterable`, - each item of it will be treated as a separate value downstream. -- "Update" - update values in-place. - The result of an Update function is always ignored, and its input is passed - downstream. -- "Filter" - to filter values from the Stream. - The result of a Filter function is interpreted as boolean. - If it's `True`, the input will be passed downstream. - If it's `False`, the record will be filtered from the stream. -- "Transform" - to transform keys and timestamps along with the values. - "Transform" functions may change the keys and should be used with caution. - The result of the Transform function is passed downstream to the next - functions. - If "expand=True" is passed and the function returns an `Iterable`, - each item of it will be treated as a separate value downstream. +**Raises**: -To execute the functions on the `Stream`, call `.compose()` method, and -it will return a closure to execute all the functions accumulated in the Stream -and its parents. +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer -**Arguments**: +**Returns**: -- `func`: a function to be called on the stream. -It is expected to be wrapped into one of "Apply", "Filter", "Update" or -"Trasform" from `quixstreams.core.stream.functions` package. -Default - "ApplyFunction(lambda value: value)". -- `parent`: a parent `Stream` +`Tuple[int, int]`: Tuple of (low,high) on success or None on timeout. +The high offset is the offset of the last message + 1. - + -#### Stream.add\_filter +#### BaseConsumer.list\_topics ```python -def add_filter(func: Union[FilterCallback, FilterWithMetadataCallback], - *, - metadata: bool = False) -> Self +def list_topics(topic: Optional[str] = None, + timeout: Optional[float] = None) -> ClusterMetadata ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L104) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L360) -Add a function to filter values from the Stream. +Request metadata from the cluster. -The return value of the function will be interpreted as `bool`. -If the function returns `False`-like result, the Stream will raise `Filtered` -exception during execution. +This method provides the same information as +listTopics(), describeTopics() and describeCluster() in the Java Admin client. **Arguments**: -- `func`: a function to filter values from the stream -- `metadata`: if True, the callback will receive key and timestamp along with -the value. -Default - `False`. +- `topic` (`str`): If specified, only request information about this topic, +else return results for all topics in cluster. +Warning: If auto.create.topics.enable is set to true on the broker and +an unknown topic is specified, it will be created. +- `timeout` (`float`): The maximum response time before timing out +None or -1 is infinite. 
Default: None -**Returns**: +**Raises**: -a new `Stream` derived from the current one +- `KafkaException`: if a Kafka-based error occurs - + -#### Stream.add\_apply +#### BaseConsumer.memberid ```python -def add_apply(func: Union[ - ApplyCallback, - ApplyExpandedCallback, - ApplyWithMetadataCallback, - ApplyWithMetadataExpandedCallback, -], - *, - expand: bool = False, - metadata: bool = False) -> Self +def memberid() -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L129) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L381) -Add an "apply" function to the Stream. +Return this client's broker-assigned group member id. -The function is supposed to return a new value, which will be passed -further during execution. +The member id is assigned by the group coordinator and is propagated to +the consumer during rebalance. -**Arguments**: +**Raises**: -- `func`: a function to generate a new value -- `expand`: if True, expand the returned iterable into individual values -downstream. If returned value is not iterable, `TypeError` will be raised. -Default - `False`. -- `metadata`: if True, the callback will receive key and timestamp along with -the value. -Default - `False`. +- `RuntimeError`: if called on a closed consumer **Returns**: -a new `Stream` derived from the current one +`Optional[string]`: Member id string or None - + -#### Stream.add\_update +#### BaseConsumer.offsets\_for\_times ```python -def add_update(func: Union[UpdateCallback, UpdateWithMetadataCallback], - *, - metadata: bool = False) -> Self +def offsets_for_times(partitions: List[TopicPartition], + timeout: Optional[float] = None) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L162) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L394) -Add an "update" function to the Stream, that will mutate the input value. +Look up offsets by timestamp for the specified partitions. -The return of this function will be ignored and its input -will be passed downstream. +The returned offset for each partition is the earliest offset whose +timestamp is greater than or equal to the given timestamp in the +corresponding partition. If the provided timestamp exceeds that of the +last message in the partition, a value of -1 will be returned. **Arguments**: -- `func`: a function to mutate the value -- `metadata`: if True, the callback will receive key and timestamp along with -the value. -Default - `False`. +- `partitions` (`List[TopicPartition]`): topic+partitions with timestamps +in the TopicPartition.offset field. +- `timeout` (`float`): The maximum response time before timing out. +None or -1 is infinite. 
Default: None + +**Raises**: + +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer **Returns**: -a new Stream derived from the current one +`List[TopicPartition]`: List of topic+partition with offset field set and possibly error set - + -#### Stream.add\_transform +#### BaseConsumer.pause ```python -def add_transform(func: Union[TransformCallback, TransformExpandedCallback], - *, - expand: bool = False) -> Self +def pause(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L186) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L420) -Add a "transform" function to the Stream, that will mutate the input value. +Pause consumption for the provided list of partitions. -The callback must accept a value, a key, and a timestamp. -It's expected to return a new value, new key and new timestamp. +Paused partitions must be tracked manually. -The result of the callback which will be passed downstream -during execution. +Does NOT affect the result of `Consumer.assignment()`. **Arguments**: -- `func`: a function to mutate the value -- `expand`: if True, expand the returned iterable into individual items -downstream. If returned value is not iterable, `TypeError` will be raised. -Default - `False`. +- `partitions` (`List[TopicPartition]`): List of topic+partitions to pause. -**Returns**: +**Raises**: -a new Stream derived from the current one +- `KafkaException`: if a Kafka-based error occurs - + -#### Stream.diff +#### BaseConsumer.resume ```python -def diff(other: "Stream") -> Self +def resume(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L211) - -Takes the difference between Streams `self` and `other` based on their last - -common parent, and returns a new, independent `Stream` that includes only -this difference (the start of the "diff" will have no parent). +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L433) -It's impossible to calculate a diff when: - - Streams don't have a common parent. - - When the `self` Stream already includes all the nodes from - the `other` Stream, and the resulting diff is empty. +Resume consumption for the provided list of partitions. **Arguments**: -- `other`: a `Stream` to take a diff from. +- `partitions` (`List[TopicPartition]`): List of topic+partitions to resume. **Raises**: -- `ValueError`: if Streams don't have a common parent, -if the diff is empty, or pruning failed. +- `KafkaException`: if a Kafka-based error occurs -**Returns**: + -a new independent `Stream` instance whose root begins at the diff - - - -#### Stream.root\_path +#### BaseConsumer.position ```python -def root_path(allow_splits=True) -> List[Self] +def position(partitions: List[TopicPartition]) -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L272) - -Return a list of all parent Streams including the node itself. - -Can optionally stop at a first encountered split with allow_splits=False - -The tree is ordered from parent to child (current node comes last). 
- -**Returns**: - -a list of `Stream` objects +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L443) - +Retrieve current positions (offsets) for the specified partitions. -#### Stream.full\_tree +**Arguments**: -```python -def full_tree() -> List[Self] -``` +- `partitions` (`List[TopicPartition]`): List of topic+partitions to return +current offsets for. The current offset is the offset of +the last consumed message + 1. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L293) +**Raises**: -Starts at tree root and finds every Stream in the tree (including splits). +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer **Returns**: -The collection of all Streams interconnected to this one +`List[TopicPartition]`: List of topic+partitions with offset and possibly error set. - + -#### Stream.compose +#### BaseConsumer.seek ```python -def compose( - allow_filters=True, - allow_expands=True, - allow_updates=True, - allow_transforms=True, - sink: Optional[Callable[[Any, Any, int, Any], - None]] = None) -> VoidExecutor +def seek(partition: TopicPartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L300) - -Generate an "executor" closure by mapping all relatives of this `Stream` and +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L457) -composing their functions together. +Set consume position for partition to offset. -The resulting "executor" can be called with a given -value, key, timestamp, and headers (i.e. a Kafka message). +The offset may be an absolute (>=0) or a +logical offset like `OFFSET_BEGINNING`. -By default, executor doesn't return the result of the execution. -To accumulate the results, pass the `sink` parameter. +`seek()` may only be used to update the consume offset of an +actively consumed partition (i.e., after `Consumer.assign()`), +to set the starting offset of partition not being consumed instead +pass the offset in an `assign()` call. **Arguments**: -- `allow_filters`: If False, this function will fail with `ValueError` if -the stream has filter functions in the tree. Default - True. -- `allow_updates`: If False, this function will fail with `ValueError` if -the stream has update functions in the tree. Default - True. -- `allow_expands`: If False, this function will fail with `ValueError` if -the stream has functions with "expand=True" in the tree. Default - True. -- `allow_transforms`: If False, this function will fail with `ValueError` if -the stream has transform functions in the tree. Default - True. -- `sink`: callable to accumulate the results of the execution, optional. +- `partition` (`TopicPartition`): Topic+partition+offset to seek to. - +**Raises**: -#### Stream.compose\_returning +- `KafkaException`: if a Kafka-based error occurs + + + +#### BaseConsumer.assignment ```python -def compose_returning() -> ReturningExecutor +def assignment() -> List[TopicPartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/stream.py#L357) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L474) -Compose a list of functions from this `Stream` and its parents into one -big closure that always returns the transformed record. +Returns the current partition assignment. 
-This closure is to be used to execute the functions in the stream and to get -the result of the transformations. +**Raises**: -Stream may only contain simple "apply" functions to be able to compose itself -into a returning function. +- `KafkaException`: if a Kafka-based error occurs +- `RuntimeError`: if called on a closed consumer - +**Returns**: -## quixstreams.core.stream.functions.update +`List[TopicPartition]`: List of assigned topic+partitions. - + -### UpdateFunction +#### BaseConsumer.set\_sasl\_credentials ```python -class UpdateFunction(StreamFunction) +def set_sasl_credentials(username: str, password: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/update.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L487) -Wrap a function into an "Update" function. +Sets the SASL credentials used for this client. -The provided function must accept a value, and it's expected to mutate it -or to perform some side effect. +These credentials will overwrite the old ones, and will be used the next +time the client needs to authenticate. +This method will not disconnect existing broker connections that have been +established with the old credentials. +This method is applicable only to SASL PLAIN and SCRAM mechanisms. -The result of the callback is always ignored, and the original input is passed -downstream. +**Arguments**: - +- `username` (`str`): your username +- `password` (`str`): your password -### UpdateWithMetadataFunction + + +#### BaseConsumer.incremental\_assign ```python -class UpdateWithMetadataFunction(StreamFunction) +def incremental_assign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/update.py#L34) - -Wrap a function into an "Update" function. - -The provided function must accept a value, a key, and a timestamp. -The callback is expected to mutate the value or to perform some side effect with it. - -The result of the callback is always ignored, and the original input is passed -downstream. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L501) -## quixstreams.core.stream.functions +Assign new partitions. - +Can be called outside the `Consumer` `on_assign` callback (multiple times). +Partitions immediately show on `Consumer.assignment()`. -## quixstreams.core.stream.functions.types +Any additional partitions besides the ones passed during the `Consumer` +`on_assign` callback will NOT be associated with the consumer group. - +**Arguments**: -## quixstreams.core.stream.functions.utils +- `partitions` (`List[TopicPartition]`): a list of topic partitions - + -#### pickle\_copier +#### BaseConsumer.incremental\_unassign ```python -def pickle_copier(obj: T) -> Callable[[], T] +def incremental_unassign(partitions: List[TopicPartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/utils.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L515) -A utility function to copy objects using a "pickle" library. +Revoke partitions. -On average, it's faster than "copy.deepcopy". -It accepts an object and returns a callable creating copies of this object. +Can be called outside an on_revoke callback. 
**Arguments**: -- `obj`: an object to copy - - - -## quixstreams.core.stream.functions.transform +- `partitions` (`List[TopicPartition]`): a list of topic partitions - + -### TransformFunction +#### BaseConsumer.close ```python -class TransformFunction(StreamFunction) +def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/transform.py#L9) - -Wrap a function into a "Transform" function. - -The provided callback must accept a value, a key and a timestamp. -It's expected to return a new value, new key and new timestamp. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L525) -This function must be used with caution, because it can technically change the -key. -It's supposed to be used by the library internals and not be a part of the public -API. +Close down and terminate the Kafka Consumer. -The result of the callback will always be passed downstream. +Actions performed: - +- Stops consuming. +- Commits offsets, unless the consumer property 'enable.auto.commit' is set to False. +- Leaves the consumer group. -## quixstreams.core.stream.functions.filter +Registered callbacks may be called from this method, +see `poll()` for more info. - + -### FilterFunction +#### BaseConsumer.consumer\_group\_metadata ```python -class FilterFunction(StreamFunction) +def consumer_group_metadata() -> GroupMetadata ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/filter.py#L9) - -Wraps a function into a "Filter" function. -The result of a Filter function is interpreted as boolean. -If it's `True`, the input will be return downstream. -If it's `False`, the `Filtered` exception will be raised to signal that the -value is filtered out. - - - -### FilterWithMetadataFunction +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/consumer.py#L542) -```python -class FilterWithMetadataFunction(StreamFunction) -``` +Used by the producer during consumer offset sending for an EOS transaction. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/filter.py#L32) + -Wraps a function into a "Filter" function. +## quixstreams.kafka.producer -The passed callback must accept value, key, and timestamp, and it's expected to -return a boolean-like result. + -If the result is `True`, the input will be passed downstream. -Otherwise, the value will be filtered out. +### Producer - +```python +class Producer() +``` -## quixstreams.core.stream.functions.base +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L42) - + -### StreamFunction +#### Producer.\_\_init\_\_ ```python -class StreamFunction(abc.ABC) +def __init__(broker_address: Union[str, ConnectionConfig], + logger: logging.Logger = logger, + error_callback: Callable[[KafkaError], None] = _default_error_cb, + extra_config: Optional[dict] = None, + flush_timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/base.py#L10) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L43) -A base class for all the streaming operations in Quix Streams. - -It provides a `get_executor` method to return a closure to be called with the input -values. +A wrapper around `confluent_kafka.Producer`. 
- +It initializes `confluent_kafka.Producer` on demand +avoiding network calls during `__init__`, provides typing info for methods +and some reasonable defaults. -#### StreamFunction.get\_executor +**Arguments**: -```python -@abc.abstractmethod -def get_executor(*child_executors: VoidExecutor) -> VoidExecutor -``` +- `broker_address`: Connection settings for Kafka. +Accepts string with Kafka broker host and port formatted as `:`, +or a ConnectionConfig object if authentication is required. +- `logger`: a Logger instance to attach librdkafka logging to +- `error_callback`: callback used for producer errors +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Producer` as is. +Note: values passed as arguments override values in `extra_config`. +- `flush_timeout`: The time the producer is waiting for all messages to be delivered. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/base.py#L24) + -Returns a wrapper to be called on a value, key, timestamp and headers. - - - -## quixstreams.core.stream.functions.apply - - - -### ApplyFunction +#### Producer.produce ```python -class ApplyFunction(StreamFunction) +def produce(topic: str, + value: Optional[Union[str, bytes]] = None, + key: Optional[Union[str, bytes]] = None, + headers: Optional[Headers] = None, + partition: Optional[int] = None, + timestamp: Optional[int] = None, + poll_timeout: float = 5.0, + buffer_error_max_tries: int = 3, + on_delivery: Optional[DeliveryCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/apply.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L81) -Wrap a function into "Apply" function. +Produce a message to a topic. -The provided callback is expected to return a new value based on input, -and its result will always be passed downstream. +It also polls Kafka for callbacks before producing to minimize +the probability of `BufferError`. +If `BufferError` still happens, the method will poll Kafka with timeout +to free up the buffer and try again. - +**Arguments**: -### ApplyWithMetadataFunction +- `topic`: topic name +- `value`: message value +- `key`: message key +- `headers`: message headers +- `partition`: topic partition +- `timestamp`: message timestamp +- `poll_timeout`: timeout for `poll()` call in case of `BufferError` +- `buffer_error_max_tries`: max retries for `BufferError`. +Pass `0` to not retry after `BufferError`. +- `on_delivery`: the delivery callback to be triggered on `poll()` +for the produced message. + + + +#### Producer.poll ```python -class ApplyWithMetadataFunction(StreamFunction) +def poll(timeout: float = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/core/stream/functions/apply.py#L51) - -Wrap a function into "Apply" function. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L142) -The provided function is expected to accept value, and timestamp and return -a new value based on input, -and its result will always be passed downstream. +Polls the producer for events and calls `on_delivery` callbacks. 
- +**Arguments**: -## quixstreams.core +- `timeout`: poll timeout seconds; Default: 0 (unlike others) +> NOTE: -1 will hang indefinitely if there are no messages to acknowledge - + -## quixstreams.processing +#### Producer.flush - +```python +def flush(timeout: Optional[float] = None) -> int +``` -## quixstreams.processing.context +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L150) - +Wait for all messages in the Producer queue to be delivered. -### ProcessingContext +**Arguments**: -```python -@dataclasses.dataclass -class ProcessingContext() -``` +- `timeout` (`float`): time to attempt flushing (seconds). +None use producer default or -1 is infinite. Default: None -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/context.py#L23) +**Returns**: -A class to share processing-related objects -between `Application` and `StreamingDataFrame` instances. +number of messages remaining to flush - + -#### ProcessingContext.store\_offset +### TransactionalProducer ```python -def store_offset(topic: str, partition: int, offset: int) +class TransactionalProducer(Producer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/context.py#L47) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/kafka/producer.py#L181) -Store the offset of the processed message to the checkpoint. +A separate producer class used only internally for transactions +(transactions are only needed when using a consumer). -**Arguments**: + -- `topic`: topic name -- `partition`: partition number -- `offset`: message offset +## quixstreams.models.serializers - + -#### ProcessingContext.init\_checkpoint +## quixstreams.models.serializers.avro + + + +### AvroSerializer ```python -def init_checkpoint() +class AvroSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/context.py#L57) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L26) -Initialize a new checkpoint - - + -#### ProcessingContext.commit\_checkpoint +#### AvroSerializer.\_\_init\_\_ ```python -def commit_checkpoint(force: bool = False) +def __init__( + schema: Schema, + strict: bool = False, + strict_allow_default: bool = False, + disable_tuple_notation: bool = False, + schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, + schema_registry_serialization_config: Optional[ + SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/context.py#L72) - -Attempts finalizing the current Checkpoint only if the Checkpoint is "expired", +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L27) -or `force=True` is passed, otherwise do nothing. +Serializer that returns data in Avro format. -To finalize: the Checkpoint will be committed if it has any stored offsets, -else just close it. A new Checkpoint is then created. +For more information see fastavro [schemaless_writer](https://fastavro.readthedocs.io/en/latest/writer.html#fastavro._write_py.schemaless_writer) method. **Arguments**: -- `force`: if `True`, commit the Checkpoint before its expiration deadline. - - - -## quixstreams.processing.pausing +- `schema`: The avro schema. 
+- `strict`: If set to True, an error will be raised if records do not contain exactly the same fields that the schema states. +Default - `False` +- `strict_allow_default`: If set to True, an error will be raised if records do not contain exactly the same fields that the schema states unless it is a missing field that has a default value in the schema. +Default - `False` +- `disable_tuple_notation`: If set to True, tuples will not be treated as a special case. Therefore, using a tuple to indicate the type of a record will not work. +Default - `False` +- `schema_registry_client_config`: If provided, serialization is offloaded to Confluent's AvroSerializer. +Default - `None` +- `schema_registry_serialization_config`: Additional configuration for Confluent's AvroSerializer. +Default - `None` +>***NOTE:*** `schema_registry_client_config` must also be set. - + -### PausingManager +### AvroDeserializer ```python -class PausingManager() +class AvroDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/pausing.py#L15) - -A class to temporarily pause topic partitions and resume them after -the timeout is elapsed. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L112) - + -#### PausingManager.pause +#### AvroDeserializer.\_\_init\_\_ ```python -def pause(topic: str, partition: int, offset_to_seek: int, - resume_after: float) +def __init__( + schema: Optional[Schema] = None, + reader_schema: Optional[Schema] = None, + return_record_name: bool = False, + return_record_name_override: bool = False, + return_named_type: bool = False, + return_named_type_override: bool = False, + handle_unicode_errors: str = "strict", + schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None +) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/pausing.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L113) -Pause the topic-partition for a certain period of time. +Deserializer that parses data from Avro. -This method is supposed to be called in case of backpressure from Sinks. +For more information see fastavro [schemaless_reader](https://fastavro.readthedocs.io/en/latest/reader.html#fastavro._read_py.schemaless_reader) method. - +**Arguments**: -#### PausingManager.is\_paused +- `schema`: The Avro schema. +- `reader_schema`: If the schema has changed since being written then the new schema can be given to allow for schema migration. +Default - `None` +- `return_record_name`: If true, when reading a union of records, the result will be a tuple where the first value is the name of the record and the second value is the record itself. +Default - `False` +- `return_record_name_override`: If true, this will modify the behavior of return_record_name so that the record name is only returned for unions where there is more than one record. For unions that only have one record, this option will make it so that the record is returned by itself, not a tuple with the name. +Default - `False` +- `return_named_type`: If true, when reading a union of named types, the result will be a tuple where the first value is the name of the type and the second value is the record itself NOTE: Using this option will ignore return_record_name and return_record_name_override. 
+Default - `False` +- `return_named_type_override`: If true, this will modify the behavior of return_named_type so that the named type is only returned for unions where there is more than one named type. For unions that only have one named type, this option will make it so that the named type is returned by itself, not a tuple with the name. +Default - `False` +- `handle_unicode_errors`: Should be set to a valid string that can be used in the errors argument of the string decode() function. +Default - `"strict"` +- `schema_registry_client_config`: If provided, deserialization is offloaded to Confluent's AvroDeserializer. +Default - `None` + + + +## quixstreams.models.serializers.exceptions + + + +### IgnoreMessage ```python -def is_paused(topic: str, partition: int) -> bool +class IgnoreMessage(exceptions.QuixException) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/pausing.py#L68) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/exceptions.py#L54) -Check if the topic-partition is already paused +Raise this exception from Deserializer.__call__ in order to ignore the processing +of the particular message. - + -#### PausingManager.resume\_if\_ready +## quixstreams.models.serializers.json + + + +### JSONSerializer ```python -def resume_if_ready() +class JSONSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/pausing.py#L74) - -Resume consuming from topic-partitions after the wait period has elapsed. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L32) - + -#### PausingManager.revoke +#### JSONSerializer.\_\_init\_\_ ```python -def revoke(topic: str, partition: int) +def __init__( + dumps: Callable[[Any], Union[str, bytes]] = default_dumps, + schema: Optional[Mapping] = None, + validator: Optional[Validator] = None, + schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, + schema_registry_serialization_config: Optional[ + SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/processing/pausing.py#L94) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L33) -Remove partition from the list of paused TPs if it's revoked +Serializer that returns data in json format. - +**Arguments**: -## quixstreams.sinks.core.influxdb3 +- `dumps`: a function to serialize objects to json. +Default - :py:func:`quixstreams.utils.json.dumps` +- `schema`: A schema used to validate the data using [`jsonschema.Draft202012Validator`](https://python-jsonschema.readthedocs.io/en/stable/api/jsonschema/validators/`jsonschema.validators.Draft202012Validator`). +Default - `None` +- `validator`: A jsonschema validator used to validate the data. Takes precedences over the schema. +Default - `None` +- `schema_registry_client_config`: If provided, serialization is offloaded to Confluent's JSONSerializer. +Default - `None` +- `schema_registry_serialization_config`: Additional configuration for Confluent's JSONSerializer. +Default - `None` +>***NOTE:*** `schema_registry_client_config` must also be set. 
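+
+As a quick illustration of the options above, here is a minimal sketch (not part of the generated reference; the topic name and schema are assumptions) that wires a `JSONSerializer` into a topic definition:
+
+```python
+from quixstreams import Application
+from quixstreams.models.serializers.json import JSONSerializer
+
+# A hypothetical schema, validated with jsonschema's Draft202012Validator.
+temperature_schema = {
+    "type": "object",
+    "properties": {
+        "sensor_id": {"type": "string"},
+        "value": {"type": "number"},
+    },
+    "required": ["sensor_id", "value"],
+}
+
+app = Application(broker_address="localhost:9092")
+
+# Values produced to this topic are serialized (and validated) as JSON.
+output_topic = app.topic(
+    "temperatures",
+    value_serializer=JSONSerializer(schema=temperature_schema),
+)
+```
+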
- + -### InfluxDB3Sink +### JSONDeserializer ```python -class InfluxDB3Sink(BatchingSink) +class JSONDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/influxdb3.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L119) - + -#### InfluxDB3Sink.\_\_init\_\_ +#### JSONDeserializer.\_\_init\_\_ ```python -def __init__(token: str, - host: str, - organization_id: str, - database: str, - measurement: str, - fields_keys: Iterable[str] = (), - tags_keys: Iterable[str] = (), - time_key: Optional[str] = None, - time_precision: WritePrecision = WritePrecision.MS, - include_metadata_tags: bool = False, - batch_size: int = 1000, - enable_gzip: bool = True, - request_timeout_ms: int = 10_000, - debug: bool = False) +def __init__( + loads: Callable[[Union[bytes, bytearray]], Any] = default_loads, + schema: Optional[Mapping] = None, + validator: Optional[Validator] = None, + schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None +) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/influxdb3.py#L24) - -A connector to sink processed data to InfluxDB v3. - -It batches the processed records in memory per topic partition, converts -them to the InfluxDB format, and flushes them to InfluxDB at the checkpoint. - -The InfluxDB sink transparently handles backpressure if the destination instance -cannot accept more data at the moment -(e.g., when InfluxDB returns an HTTP 429 error with the "retry_after" header set). -When this happens, the sink will notify the Application to pause consuming -from the backpressured topic partition until the "retry_after" timeout elapses. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L120) ->***NOTE***: InfluxDB3Sink can accept only dictionaries. -> If the record values are not dicts, you need to convert them to dicts before -> sinking. +Deserializer that parses data from JSON **Arguments**: -- `token`: InfluxDB access token -- `host`: InfluxDB host in format "https://" -- `organization_id`: InfluxDB organization_id -- `database`: database name -- `fields_keys`: a list of keys to be used as "fields" when writing to InfluxDB. -If present, it must not overlap with "tags_keys". -If empty, the whole record value will be used. ->***NOTE*** The fields' values can only be strings, floats, integers, or booleans. -Default - `()`. -- `tags_keys`: a list of keys to be used as "tags" when writing to InfluxDB. -If present, it must not overlap with "fields_keys". -These keys will be popped from the value dictionary -automatically because InfluxDB doesn't allow the same keys be -both in tags and fields. -If empty, no tags will be sent. ->***NOTE***: InfluxDB client always converts tag values to strings. -Default - `()`. -- `time_key`: a key to be used as "time" when writing to InfluxDB. -By default, the record timestamp will be used with "ms" time precision. -When using a custom key, you may need to adjust the `time_precision` setting -to match. -- `time_precision`: a time precision to use when writing to InfluxDB. -- `include_metadata_tags`: if True, includes record's key, topic, -and partition as tags. -Default - `False`. -- `batch_size`: how many records to write to InfluxDB in one request. -Note that it only affects the size of one write request, and not the number -of records flushed on each checkpoint. 
-Default - `1000`. -- `enable_gzip`: if True, enables gzip compression for writes. -Default - `True`. -- `request_timeout_ms`: an HTTP request timeout in milliseconds. -Default - `10000`. -- `debug`: if True, print debug logs from InfluxDB client. -Default - `False`. - - - -## quixstreams.sinks.core +- `loads`: function to parse json from bytes. +Default - :py:func:`quixstreams.utils.json.loads`. +- `schema`: A schema used to validate the data using [`jsonschema.Draft202012Validator`](https://python-jsonschema.readthedocs.io/en/stable/api/jsonschema/validators/`jsonschema.validators.Draft202012Validator`). +Default - `None` +- `validator`: A jsonschema validator used to validate the data. Takes precedences over the schema. +Default - `None` +- `schema_registry_client_config`: If provided, deserialization is offloaded to Confluent's JSONDeserializer. +Default - `None` - + -## quixstreams.sinks.core.csv +## quixstreams.models.serializers.protobuf - + -### CSVSink +### ProtobufSerializer ```python -class CSVSink(BatchingSink) +class ProtobufSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/csv.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L24) - + -#### CSVSink.\_\_init\_\_ +#### ProtobufSerializer.\_\_init\_\_ ```python -def __init__(path: str, - dialect: str = "excel", - key_serializer: Callable[[Any], str] = str, - value_serializer: Callable[[Any], str] = json.dumps) +def __init__( + msg_type: Message, + deterministic: bool = False, + ignore_unknown_fields: bool = False, + schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, + schema_registry_serialization_config: Optional[ + SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/csv.py#L10) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L25) -A base CSV sink that writes data from all assigned partitions to a single file. - -It's best to be used for local debugging. +Serializer that returns data in protobuf format. -Column format: - (key, value, timestamp, topic, partition, offset) +Serialisation from a python dictionary can have a significant performance impact. An alternative is to pass the serializer an object of the `msg_type` class. **Arguments**: -- `path`: a path to CSV file -- `dialect`: a CSV dialect to use. It affects quoting and delimiters. -See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info. -Default - `"excel"`. -- `key_serializer`: a callable to convert keys to strings. -Default - `str`. -- `value_serializer`: a callable to convert values to strings. -Default - `json.dumps`. - - - -## quixstreams.sinks - - - -## quixstreams.sinks.community.postgresql +- `msg_type`: protobuf message class. +- `deterministic`: If true, requests deterministic serialization of the protobuf, with predictable ordering of map keys +Default - `False` +- `ignore_unknown_fields`: If True, do not raise errors for unknown fields. +Default - `False` +- `schema_registry_client_config`: If provided, serialization is offloaded to Confluent's ProtobufSerializer. +Default - `None` +- `schema_registry_serialization_config`: Additional configuration for Confluent's ProtobufSerializer. +Default - `None` +>***NOTE:*** `schema_registry_client_config` must also be set. 
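+
+For orientation, a minimal sketch of plugging `ProtobufSerializer` into a topic definition; `sensor_pb2.SensorReading` is a hypothetical class compiled from a .proto file and is not part of the documented API:
+
+```python
+from quixstreams import Application
+from quixstreams.models.serializers.protobuf import ProtobufSerializer
+
+from sensor_pb2 import SensorReading  # hypothetical protoc-generated class
+
+app = Application(broker_address="localhost:9092")
+
+# Values may be passed as dicts or as SensorReading instances; passing
+# instances avoids the dict-conversion overhead mentioned above.
+output_topic = app.topic(
+    "sensor-readings-proto",
+    value_serializer=ProtobufSerializer(msg_type=SensorReading),
+)
+```
+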
- + -### PostgreSQLSink +### ProtobufDeserializer ```python -class PostgreSQLSink(BatchingSink) +class ProtobufDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/postgresql.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L110) - + -#### PostgreSQLSink.\_\_init\_\_ +#### ProtobufDeserializer.\_\_init\_\_ ```python -def __init__(host: str, - port: int, - dbname: str, - user: str, - password: str, - table_name: str, - schema_auto_update: bool = True, - **kwargs) +def __init__( + msg_type: Message, + use_integers_for_enums: bool = False, + preserving_proto_field_name: bool = False, + to_dict: bool = True, + schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, + schema_registry_serialization_config: Optional[ + SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/postgresql.py#L49) - -A connector to sink topic data to PostgreSQL. - -**Arguments**: - -- `host`: PostgreSQL server address. -- `port`: PostgreSQL server port. -- `dbname`: PostgreSQL database name. -- `user`: Database user name. -- `password`: Database user password. -- `table_name`: PostgreSQL table name. -- `schema_auto_update`: Automatically update the schema when new columns are detected. -- `ddl_timeout`: Timeout for DDL operations such as table creation or schema updates. -- `kwargs`: Additional parameters for `psycopg2.connect`. - - - -## quixstreams.sinks.community.file.formats.parquet - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L111) -### ParquetFormat +Deserializer that parses protobuf data into a dictionary suitable for a StreamingDataframe. -```python -class ParquetFormat(Format) -``` +Deserialisation to a python dictionary can have a significant performance impact. You can disable this behavior using `to_dict`, in that case the protobuf message will be used as the StreamingDataframe row value. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L16) +**Arguments**: -Serializes batches of messages into Parquet format. +- `msg_type`: protobuf message class. +- `use_integers_for_enums`: If true, use integers instead of enum names. +Default - `False` +- `preserving_proto_field_name`: If True, use the original proto field names as +defined in the .proto file. If False, convert the field names to +lowerCamelCase. +Default - `False` +- `to_dict`: If false, return the protobuf message instead of a dict. +Default - `True` +- `schema_registry_client_config`: If provided, deserialization is offloaded to Confluent's ProtobufDeserializer. +Default - `None` +- `schema_registry_serialization_config`: Additional configuration for Confluent's ProtobufDeserializer. +Default - `None` +>***NOTE:*** `schema_registry_client_config` must also be set. -This class provides functionality to serialize a `SinkBatch` into bytes -in Parquet format using PyArrow. It allows setting the file extension -and compression algorithm used for the Parquet files. + -This format does not support appending to existing files. 
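+
+A companion sketch for the consuming side, reusing the same hypothetical `sensor_pb2.SensorReading` class; `preserving_proto_field_name=True` keeps the field names as written in the .proto file:
+
+```python
+from quixstreams import Application
+from quixstreams.models.serializers.protobuf import ProtobufDeserializer
+
+from sensor_pb2 import SensorReading  # hypothetical protoc-generated class
+
+app = Application(broker_address="localhost:9092")
+
+input_topic = app.topic(
+    "sensor-readings-proto",
+    value_deserializer=ProtobufDeserializer(
+        msg_type=SensorReading,
+        preserving_proto_field_name=True,
+    ),
+)
+
+# With to_dict=True (the default), each row value arrives as a plain dict.
+sdf = app.dataframe(input_topic)
+sdf = sdf.update(lambda value: print(value["sensor_id"]))
+```
+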
+## quixstreams.models.serializers.schema\_registry - + -#### ParquetFormat.\_\_init\_\_ +### SchemaRegistryClientConfig ```python -def __init__(file_extension: str = ".parquet", - compression: Compression = "snappy") -> None +class SchemaRegistryClientConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/schema_registry.py#L22) -Initializes the ParquetFormat. +Configuration required to establish the connection with a Schema Registry. **Arguments**: -- `file_extension`: The file extension to use for output files. -Defaults to ".parquet". -- `compression`: The compression algorithm to use for Parquet files. -Allowed values are "none", "snappy", "gzip", "brotli", "lz4", -or "zstd". Defaults to "snappy". +- `url`: Schema Registry URL. +- `ssl_ca_location`: Path to CA certificate file used to verify the +Schema Registry's private key. +- `ssl_key_location`: Path to the client's private key (PEM) used for +authentication. +>***NOTE:*** `ssl_certificate_location` must also be set. +- `ssl_certificate_location`: Path to the client's public key (PEM) used +for authentication. +>***NOTE:*** May be set without `ssl_key_location` if the private key is +stored within the PEM as well. +- `basic_auth_user_info`: Client HTTP credentials in the form of +`username:password`. +>***NOTE:*** By default, userinfo is extracted from the URL if present. - + -#### ParquetFormat.file\_extension +### SchemaRegistrySerializationConfig ```python -@property -def file_extension() -> str +class SchemaRegistrySerializationConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L47) - -Returns the file extension used for output files. - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/schema_registry.py#L48) -The file extension as a string. +Configuration that instructs Serializer how to handle communication with a - +Schema Registry. -#### ParquetFormat.serialize +**Arguments**: -```python -def serialize(batch: SinkBatch) -> bytes -``` +- `auto_register_schemas`: If True, automatically register the configured schema +with Confluent Schema Registry if it has not previously been associated with the +relevant subject (determined via subject.name.strategy). Defaults to True. +- `normalize_schemas`: Whether to normalize schemas, which will transform schemas +to have a consistent format, including ordering properties and references. +- `use_latest_version`: Whether to use the latest subject version for serialization. +>***NOTE:*** There is no check that the latest schema is backwards compatible with the +object being serialized. Defaults to False. +- `subject_name_strategy`: Callable(SerializationContext, str) -> str +Defines how Schema Registry subject names are constructed. Standard naming +strategies are defined in the confluent_kafka.schema_registry namespace. +Defaults to topic_subject_name_strategy. +- `skip_known_types`: Whether or not to skip known types when resolving +schema dependencies. Defaults to False. +- `reference_subject_name_strategy`: Defines how Schema Registry subject names +for schema references are constructed. Defaults to reference_subject_name_strategy. 
+- `use_deprecated_format`: Specifies whether the Protobuf serializer should +serialize message indexes without zig-zag encoding. This option must be explicitly +configured as older and newer Protobuf producers are incompatible. +If the consumers of the topic being produced to are using confluent-kafka-python <1.8, +then this property must be set to True until all old consumers have been upgraded. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L55) + -Serializes a `SinkBatch` into bytes in Parquet format. +## quixstreams.models.serializers.simple\_types -Each item in the batch is converted into a dictionary with "_timestamp", -"_key", and the keys from the message value. If the message key is in -bytes, it is decoded to a string. + -Missing fields in messages are filled with `None` to ensure all rows -have the same columns. +### BytesDeserializer -**Arguments**: +```python +class BytesDeserializer(Deserializer) +``` -- `batch`: The `SinkBatch` to serialize. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L56) -**Returns**: +A deserializer to bypass bytes without any changes -The serialized batch as bytes in Parquet format. + - +### BytesSerializer -## quixstreams.sinks.community.file.formats +```python +class BytesSerializer(Serializer) +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L65) -### InvalidFormatError +A serializer to bypass bytes without any changes + + + +### StringDeserializer ```python -class InvalidFormatError(Exception) +class StringDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/__init__.py#L17) - -Raised when the format is specified incorrectly. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L74) - + -#### resolve\_format +#### StringDeserializer.\_\_init\_\_ ```python -def resolve_format(format: Union[FormatName, Format]) -> Format +def __init__(codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/__init__.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L75) -Resolves the format into a `Format` instance. +Deserializes bytes to strings using the specified encoding. **Arguments**: -- `format`: The format to resolve, either a format name ("json", -"parquet") or a `Format` instance. +- `codec`: string encoding +A wrapper around `confluent_kafka.serialization.StringDeserializer`. -**Raises**: + -- `InvalidFormatError`: If the format name is invalid. +### IntegerDeserializer -**Returns**: +```python +class IntegerDeserializer(Deserializer) +``` -An instance of `Format` corresponding to the specified format. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L93) - +Deserializes bytes to integers. -## quixstreams.sinks.community.file.formats.json +A wrapper around `confluent_kafka.serialization.IntegerDeserializer`. 
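+
+To tie the simple deserializers above together, a small sketch (the topic name is an assumption) of consuming a topic keyed by UTF-8 strings with integer values:
+
+```python
+from quixstreams import Application
+from quixstreams.models.serializers.simple_types import (
+    IntegerDeserializer,
+    StringDeserializer,
+)
+
+app = Application(broker_address="localhost:9092")
+
+# Keys are decoded as UTF-8 strings, values as integers.
+counts_topic = app.topic(
+    "word-counts",
+    key_deserializer=StringDeserializer(),
+    value_deserializer=IntegerDeserializer(),
+)
+
+sdf = app.dataframe(counts_topic)
+```
+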
- + -### JSONFormat +### DoubleDeserializer ```python -class JSONFormat(Format) +class DoubleDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L14) - -Serializes batches of messages into JSON Lines format with optional gzip -compression. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L111) -This class provides functionality to serialize a `SinkBatch` into bytes -in JSON Lines format. It supports optional gzip compression and allows -for custom JSON serialization through the `dumps` parameter. +Deserializes float to IEEE 764 binary64. -This format supports appending to existing files. +A wrapper around `confluent_kafka.serialization.DoubleDeserializer`. - + -#### JSONFormat.\_\_init\_\_ +### StringSerializer ```python -def __init__(file_extension: str = ".jsonl", - compress: bool = False, - dumps: Optional[Callable[[Any], str]] = None) -> None +class StringSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L28) - -Initializes the JSONFormat. - -**Arguments**: - -- `file_extension`: The file extension to use for output files. -Defaults to ".jsonl". -- `compress`: If `True`, compresses the output using gzip and -appends ".gz" to the file extension. Defaults to `False`. -- `dumps`: A custom function to serialize objects to JSON-formatted -strings. If provided, the `compact` option is ignored. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L129) - + -#### JSONFormat.file\_extension +#### StringSerializer.\_\_init\_\_ ```python -@property -def file_extension() -> str +def __init__(codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L57) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L130) -Returns the file extension used for output files. +Serializes strings to bytes using the specified encoding. -**Returns**: +**Arguments**: -The file extension as a string. +- `codec`: string encoding - + -#### JSONFormat.serialize +### IntegerSerializer ```python -def serialize(batch: SinkBatch) -> bytes +class IntegerSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L65) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L142) -Serializes a `SinkBatch` into bytes in JSON Lines format. +Serializes integers to bytes -Each item in the batch is converted into a JSON object with -"_timestamp", "_key", and "_value" fields. If the message key is -in bytes, it is decoded to a string. + -**Arguments**: +### DoubleSerializer -- `batch`: The `SinkBatch` to serialize. +```python +class DoubleSerializer(Serializer) +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L155) -The serialized batch in JSON Lines format, optionally -compressed with gzip. 
+Serializes floats to bytes - + -## quixstreams.sinks.community.file.formats.base +## quixstreams.models.serializers.base - + -### Format +### SerializationContext ```python -class Format(ABC) +class SerializationContext(_SerializationContext) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L8) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/base.py#L24) -Base class for formatting batches in file sinks. +Provides additional context for message serialization/deserialization. -This abstract base class defines the interface for batch formatting -in file sinks. Subclasses should implement the `file_extension` -property and the `serialize` method to define how batches are -formatted and saved. +Every `Serializer` and `Deserializer` receives an instance of `SerializationContext` - + -#### Format.file\_extension +### Deserializer ```python -@property -@abstractmethod -def file_extension() -> str +class Deserializer(abc.ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L20) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/base.py#L44) -Returns the file extension used for output files. + -**Returns**: +#### Deserializer.\_\_init\_\_ -The file extension as a string. +```python +def __init__(*args, **kwargs) +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/base.py#L45) -#### Format.supports\_append +A base class for all Deserializers + + + +#### Deserializer.split\_values ```python @property -@abstractmethod -def supports_append() -> bool +def split_values() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L30) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/base.py#L51) -Indicates if the format supports appending data to an existing file. +Return True if the deserialized message should be considered as Iterable +and each item in it should be processed as a separate message. -**Returns**: + -True if appending is supported, otherwise False. +### Serializer - +```python +class Serializer(abc.ABC) +``` -#### Format.serialize +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/base.py#L62) + +A base class for all Serializers + + + +#### Serializer.extra\_headers ```python -@abstractmethod -def serialize(batch: SinkBatch) -> bytes +@property +def extra_headers() -> HeadersMapping ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/base.py#L68) -Serializes a batch of messages into bytes. +Informs producer to set additional headers -**Arguments**: +for the message it will be serializing -- `batch`: The batch of messages to serialize. +Must return a dictionary with headers. +Keys must be strings, and values must be strings, bytes or None. **Returns**: -The serialized batch as bytes. 
+dict with headers - + -## quixstreams.sinks.community.file.sink +## quixstreams.models.serializers.quix - + -### FileSink +### QuixDeserializer ```python -class FileSink(BatchingSink) +class QuixDeserializer(JSONDeserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/sink.py#L11) - -A sink that writes data batches to files using configurable formats and -destinations. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L76) -The sink groups messages by their topic and partition, ensuring data from the -same source is stored together. Each batch is serialized using the specified -format (e.g., JSON, Parquet) before being written to the configured -destination. +Handles Deserialization for any Quix-formatted topic. -The destination determines the storage location and write behavior. By default, -it uses LocalDestination for writing to the local filesystem, but can be -configured to use other storage backends (e.g., cloud storage). +Parses JSON data from either `TimeseriesData` and `EventData` (ignores the rest). - + -#### FileSink.\_\_init\_\_ +#### QuixDeserializer.\_\_init\_\_ ```python -def __init__(directory: str = "", - format: Union[FormatName, Format] = "json", - destination: Optional[Destination] = None) -> None +def __init__(loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/sink.py#L25) - -Initialize the FileSink with the specified configuration. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L83) **Arguments**: -- `directory`: Base directory path for storing files. Defaults to -current directory. -- `format`: Data serialization format, either as a string -("json", "parquet") or a Format instance. -- `destination`: Storage destination handler. Defaults to -LocalDestination if not specified. +- `loads`: function to parse json from bytes. +Default - :py:func:`quixstreams.utils.json.loads`. - + -#### FileSink.write +#### QuixDeserializer.split\_values ```python -def write(batch: SinkBatch) -> None +@property +def split_values() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/sink.py#L46) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L100) -Write a batch of data using the configured format and destination. +Each Quix message might contain data for multiple Rows. +This property informs the downstream processors about that, so they can +expect an Iterable instead of Mapping. -The method performs the following steps: -1. Serializes the batch data using the configured format -2. Writes the serialized data to the destination -3. Handles any write failures by raising a backpressure error + -**Arguments**: +#### QuixDeserializer.deserialize -- `batch`: The batch of data to write. +```python +def deserialize(model_key: str, value: Union[List[Mapping], + Mapping]) -> Iterable[Mapping] +``` -**Raises**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L153) -- `SinkBackpressureError`: If the write operation fails, indicating -that the sink needs backpressure with a 5-second retry delay. +Deserialization function for particular data types (Timeseries or EventData). 
- +**Arguments**: -## quixstreams.sinks.community.file.destinations.local +- `model_key`: value of "__Q_ModelKey" message header +- `value`: deserialized JSON value of the message, list or dict - +**Returns**: -### LocalDestination +Iterable of dicts -```python -class LocalDestination(Destination) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L15) +### QuixSerializer -A destination that writes data to the local filesystem. +```python +class QuixSerializer(JSONSerializer) +``` -Handles writing data to local files with support for both creating new files -and appending to existing ones. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L274) - + -#### LocalDestination.\_\_init\_\_ +#### QuixSerializer.\_\_init\_\_ ```python -def __init__(append: bool = False) -> None +def __init__(as_legacy: bool = True, + dumps: Callable[[Any], Union[str, bytes]] = default_dumps) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L22) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L278) -Initialize the local destination. +Serializer that returns data in json format. **Arguments**: -- `append`: If True, append to existing files instead of creating new -ones. Defaults to False. +- `as_legacy`: parse as the legacy format; Default = True +- `dumps`: a function to serialize objects to json. +Default - :py:func:`quixstreams.utils.json.dumps` - + -#### LocalDestination.set\_extension +### QuixTimeseriesSerializer ```python -def set_extension(format: Format) -> None +class QuixTimeseriesSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L32) - -Set the file extension and validate append mode compatibility. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L321) -**Arguments**: +Serialize data to JSON formatted according to Quix Timeseries format. -- `format`: The Format instance that defines the file extension. +The serializable object must be dictionary, and each item must be of `str`, `int`, +`float`, `bytes` or `bytearray` type. +Otherwise, the `SerializationError` will be raised. -**Raises**: +Input: +```python +{'a': 1, 'b': 1.1, 'c': "string", 'd': b'bytes', 'Tags': {'tag1': 'tag'}} +``` -- `ValueError`: If append mode is enabled but the format doesn't -support appending. +Output: +```json +{ + "Timestamps": [123123123], + "NumericValues": {"a": [1], "b": [1.1]}, + "StringValues": {"c": ["string"]}, + "BinaryValues": {"d": ["Ynl0ZXM="]}, + "TagValues": {"tag1": ["tag"]} +} +``` - + -#### LocalDestination.write +### QuixEventsSerializer ```python -def write(data: bytes, batch: SinkBatch) -> None +class QuixEventsSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L43) - -Write data to a local file. - -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L409) -- `data`: The serialized data to write. -- `batch`: The batch information containing topic and partition details. +Serialize data to JSON formatted according to Quix EventData format. 
+The input value is expected to be a dictionary with the following keys: + - "Id" (type `str`, default - "") + - "Value" (type `str`, default - ""), + - "Tags" (type `dict`, default - {}) - +>***NOTE:*** All the other fields will be ignored. -## quixstreams.sinks.community.file.destinations +Input: +```python +{ + "Id": "an_event", + "Value": "any_string", + "Tags": {"tag1": "tag"}} +} +``` - +Output: +```json +{ + "Id": "an_event", + "Value": "any_string", + "Tags": {"tag1": "tag"}}, + "Timestamp":1692703362840389000 +} +``` -## quixstreams.sinks.community.file.destinations.s3 + - +## quixstreams.models.topics.exceptions -### S3BucketNotFoundError + -```python -class S3BucketNotFoundError(Exception) -``` +## quixstreams.models.topics -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L13) + -Raised when the specified S3 bucket does not exist. +## quixstreams.models.topics.manager - + -### S3BucketAccessDeniedError +#### affirm\_ready\_for\_create ```python -class S3BucketAccessDeniedError(Exception) +def affirm_ready_for_create(topics: List[Topic]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L17) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L21) -Raised when the specified S3 bucket access is denied. +Validate a list of topics is ready for creation attempt - +**Arguments**: -### S3Destination +- `topics`: list of `Topic`s + + + +### TopicManager ```python -class S3Destination(Destination) +class TopicManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L31) -A destination that writes data to Amazon S3. +The source of all topic management for a Quix Streams Application. -Handles writing data to S3 buckets using the AWS SDK. Credentials can be -provided directly or via environment variables. +Intended only for internal use by Application. - +To create a Topic, use Application.topic() or generate them directly. -#### S3Destination.\_\_init\_\_ + + +#### TopicManager.\_\_init\_\_ ```python -def __init__(bucket: str, - aws_access_key_id: Optional[str] = getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key: Optional[str] = getenv( - "AWS_SECRET_ACCESS_KEY"), - region_name: Optional[str] = getenv("AWS_REGION", - getenv("AWS_DEFAULT_REGION")), - **kwargs) -> None +def __init__(topic_admin: TopicAdmin, + consumer_group: str, + timeout: float = 30, + create_timeout: float = 60, + auto_create_topics: bool = True) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L28) - -Initialize the S3 destination. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L52) **Arguments**: -- `bucket`: Name of the S3 bucket to write to. -- `aws_access_key_id`: AWS access key ID. Defaults to AWS_ACCESS_KEY_ID -environment variable. -- `aws_secret_access_key`: AWS secret access key. Defaults to -AWS_SECRET_ACCESS_KEY environment variable. -- `region_name`: AWS region name. Defaults to AWS_REGION or -AWS_DEFAULT_REGION environment variable. -- `kwargs`: Additional keyword arguments passed to boto3.client. 
- -**Raises**: - -- `S3BucketNotFoundError`: If the specified bucket doesn't exist. -- `S3BucketAccessDeniedError`: If access to the bucket is denied. +- `topic_admin`: an `Admin` instance (required for some functionality) +- `consumer_group`: the consumer group (of the `Application`) +- `timeout`: response timeout (seconds) +- `create_timeout`: timeout for topic creation - + -#### S3Destination.write +#### TopicManager.changelog\_topics ```python -def write(data: bytes, batch: SinkBatch) -> None +@property +def changelog_topics() -> Dict[Optional[str], Dict[str, Topic]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L78) - -Write data to S3. - -**Arguments**: - -- `data`: The serialized data to write. -- `batch`: The batch information containing topic and partition details. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L104) - +Note: `Topic`s are the changelogs. -## quixstreams.sinks.community.file.destinations.base +returns: the changelog topic dict, {topic_name: {suffix: Topic}} - + -### Destination +#### TopicManager.all\_topics ```python -class Destination(ABC) +@property +def all_topics() -> Dict[str, Topic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L16) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L113) -Abstract base class for defining where and how data should be stored. +Every registered topic name mapped to its respective `Topic`. -Destinations handle the storage of serialized data, whether that's to local -disk, cloud storage, or other locations. They manage the physical writing of -data while maintaining a consistent directory/path structure based on topics -and partitions. +returns: full topic dict, {topic_name: Topic} - + -#### Destination.set\_directory +#### TopicManager.topic\_config ```python -def set_directory(directory: str) -> None +def topic_config(num_partitions: Optional[int] = None, + replication_factor: Optional[int] = None, + extra_config: Optional[dict] = None) -> TopicConfig ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L223) -Configure the base directory for storing files. +Convenience method for generating a `TopicConfig` with default settings **Arguments**: -- `directory`: The base directory path where files will be stored. +- `num_partitions`: the number of topic partitions +- `replication_factor`: the topic replication factor +- `extra_config`: other optional configuration settings -**Raises**: +**Returns**: -- `ValueError`: If the directory path contains invalid characters. -Only alphanumeric characters (a-zA-Z0-9), spaces, dots, and -underscores are allowed. 
+a TopicConfig object - + -#### Destination.set\_extension +#### TopicManager.topic ```python -def set_extension(format: Format) -> None +def topic(name: str, + value_deserializer: Optional[DeserializerType] = None, + key_deserializer: Optional[DeserializerType] = "bytes", + value_serializer: Optional[SerializerType] = None, + key_serializer: Optional[SerializerType] = "bytes", + config: Optional[TopicConfig] = None, + timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L45) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L244) -Set the file extension based on the format. +A convenience method for generating a `Topic`. Will use default config options + +as dictated by the TopicManager. **Arguments**: -- `format`: The Format instance that defines the file extension. +- `name`: topic name +- `value_deserializer`: a deserializer type for values +- `key_deserializer`: a deserializer type for keys +- `value_serializer`: a serializer type for values +- `key_serializer`: a serializer type for keys +- `config`: optional topic configurations (for creation/validation) +- `timestamp_extractor`: a callable that returns a timestamp in +milliseconds from a deserialized message. - +**Returns**: -#### Destination.write +Topic object with creation configs + + + +#### TopicManager.register ```python -@abstractmethod -def write(data: bytes, batch: SinkBatch) -> None +def register(topic: Topic) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L290) -Write the serialized data to storage. +Register an already generated :class:`quixstreams.models.topics.Topic` to the topic manager. + +The topic name and config can be updated by the topic manager. **Arguments**: -- `data`: The serialized data to write. -- `batch`: The batch information containing topic, partition and offset -details. +- `topic`: The topic to register - + -## quixstreams.sinks.community.file +#### TopicManager.repartition\_topic - +```python +def repartition_topic(operation: str, + topic_name: str, + value_deserializer: Optional[DeserializerType] = "json", + key_deserializer: Optional[DeserializerType] = "json", + value_serializer: Optional[SerializerType] = "json", + key_serializer: Optional[SerializerType] = "json", + timeout: Optional[float] = None) -> Topic +``` -## quixstreams.sinks.community.bigquery +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L308) - +Create an internal repartition topic. -### BigQuerySink +**Arguments**: -```python -class BigQuerySink(BatchingSink) -``` +- `operation`: name of the GroupBy operation (column name or user-defined). +- `topic_name`: name of the topic the GroupBy is sourced from. 
+- `value_deserializer`: a deserializer type for values; default - JSON +- `key_deserializer`: a deserializer type for keys; default - JSON +- `value_serializer`: a serializer type for values; default - JSON +- `key_serializer`: a serializer type for keys; default - JSON +- `timeout`: config lookup timeout (seconds); Default 30 -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/bigquery.py#L53) +**Returns**: - +`Topic` object (which is also stored on the TopicManager) -#### BigQuerySink.\_\_init\_\_ + + +#### TopicManager.changelog\_topic ```python -def __init__(project_id: str, - location: str, - dataset_id: str, - table_name: str, - service_account_json: Optional[str] = None, - schema_auto_update: bool = True, - ddl_timeout: float = 10.0, - insert_timeout: float = 10.0, - retry_timeout: float = 30.0, - **kwargs) +def changelog_topic(topic_name: Optional[str], + store_name: str, + config: Optional[TopicConfig] = None, + timeout: Optional[float] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/bigquery.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L348) -A connector to sink processed data to Google Cloud BigQuery. +Performs all the logic necessary to generate a changelog topic based on an -It batches the processed records in memory per topic partition, and flushes them to BigQuery at the checkpoint. +optional "source topic" (aka input/consumed topic). ->***NOTE***: BigQuerySink can accept only dictionaries. -> If the record values are not dicts, you need to convert them to dicts before -> sinking. +Its main goal is to ensure partition counts of the to-be generated changelog +match the source topic, and ensure the changelog topic is compacted. Also +enforces the serialization type. All `Topic` objects generated with this are +stored on the TopicManager. -The column names and types are inferred from individual records. -Each key in the record's dictionary will be inserted as a column to the resulting BigQuery table. +If source topic already exists, defers to the existing topic settings, else +uses the settings as defined by the `Topic` (and its defaults) as generated +by the `TopicManager`. -If the column is not present in the schema, the sink will try to add new nullable columns on the fly with types inferred from individual values. -The existing columns will not be affected. -To disable this behavior, pass `schema_auto_update=False` and define the necessary schema upfront. -The minimal schema must define two columns: "timestamp" of type TIMESTAMP, and "__key" with a type of the expected message key. +In general, users should NOT need this; an Application knows when/how to +generate changelog topics. To turn off changelogs, init an Application with +"use_changelog_topics"=`False`. **Arguments**: -- `project_id`: a Google project id. -- `location`: a BigQuery location. -- `dataset_id`: a BigQuery dataset id. -If the dataset does not exist, the sink will try to create it. -- `table_name`: BigQuery table name. -If the table does not exist, the sink will try to create it with a default schema. -- `service_account_json`: an optional JSON string with service account credentials -to connect to BigQuery. -The internal `google.cloud.bigquery.Client` will use the Application Default Credentials if not provided. -See https://cloud.google.com/docs/authentication/provide-credentials-adc for more info. 
-Default - `None`. -- `schema_auto_update`: if True, the sink will try to create a dataset and a table if they don't exist. -It will also add missing columns on the fly with types inferred from individual values. -- `ddl_timeout`: a timeout for a single DDL operation (adding tables, columns, etc.). -Default - 10s. -- `insert_timeout`: a timeout for a single INSERT operation. -Default - 10s. -- `retry_timeout`: a total timeout for each request to BigQuery API. -During this timeout, a request can be retried according -to the client's default retrying policy. -- `kwargs`: Additional keyword arguments passed to `bigquery.Client`. +- `topic_name`: name of consumed topic (app input topic) +> NOTE: normally contain any prefixes added by TopicManager.topic() +- `store_name`: name of the store this changelog belongs to +(default, rolling10s, etc.) +- `config`: the changelog topic configuration. Default to `topic_name` configuration or TopicManager default +- `timeout`: config lookup timeout (seconds); Default 30 - +**Returns**: -## quixstreams.sinks.community.kinesis +`Topic` object (which is also stored on the TopicManager) - + -### KinesisStreamNotFoundError +#### TopicManager.create\_topics ```python -class KinesisStreamNotFoundError(Exception) +def create_topics(topics: List[Topic], + timeout: Optional[float] = None, + create_timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L23) - -Raised when the specified Kinesis stream does not exist. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L416) - +Creates topics via an explicit list of provided `Topics`. -### KinesisSink +Exists as a way to manually specify what topics to create; otherwise, +`create_all_topics()` is generally simpler. -```python -class KinesisSink(BaseSink) -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L27) +- `topics`: list of `Topic`s +- `timeout`: creation acknowledge timeout (seconds); Default 30 +- `create_timeout`: topic finalization timeout (seconds); Default 60 - + -#### KinesisSink.\_\_init\_\_ +#### TopicManager.create\_all\_topics ```python -def __init__(stream_name: str, - aws_access_key_id: Optional[str] = getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key: Optional[str] = getenv( - "AWS_SECRET_ACCESS_KEY"), - region_name: Optional[str] = getenv("AWS_REGION", - getenv("AWS_DEFAULT_REGION")), - value_serializer: Callable[[Any], str] = json.dumps, - key_serializer: Callable[[Any], str] = bytes.decode, - **kwargs) -> None +def create_all_topics(timeout: Optional[float] = None, + create_timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L444) -Initialize the KinesisSink. +A convenience method to create all Topic objects stored on this TopicManager. + +If `auto_create_topics` is set to False no topic will be created. **Arguments**: -- `stream_name`: Kinesis stream name. -- `aws_access_key_id`: AWS access key ID. -- `aws_secret_access_key`: AWS secret access key. -- `region_name`: AWS region name (e.g., 'us-east-1'). -- `value_serializer`: Function to serialize the value to string -(defaults to json.dumps). 
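
The `TopicManager` workflow documented above can also be driven by hand, outside of an `Application`, for example from a small setup script. A minimal sketch, assuming a local broker at `localhost:9092`, an illustrative `orders` topic, and the module paths shown in the section headings above:

```python
from quixstreams.models.topics.admin import TopicAdmin
from quixstreams.models.topics.manager import TopicManager

admin = TopicAdmin(broker_address="localhost:9092")
topic_manager = TopicManager(topic_admin=admin, consumer_group="my-consumer-group")

# Build a TopicConfig with explicit settings and register a Topic that uses it
config = topic_manager.topic_config(num_partitions=3, replication_factor=1)
orders = topic_manager.topic("orders", value_deserializer="json", config=config)

# Create every Topic registered on this manager; topics that already exist are left as-is
topic_manager.create_all_topics()
```

In most applications `app.topic()` and `app.run()` drive these calls for you; direct use is mainly for tooling or standalone topic setup.
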
-- `key_serializer`: Function to serialize the key to string -(defaults to bytes.decode). -- `kwargs`: Additional keyword arguments passed to boto3.client. +- `timeout`: creation acknowledge timeout (seconds); Default 30 +- `create_timeout`: topic finalization timeout (seconds); Default 60 - + -#### KinesisSink.add +#### TopicManager.validate\_all\_topics ```python -def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, - topic: str, partition: int, offset: int) -> None +def validate_all_topics(timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L460) -Buffer a record for the Kinesis stream. +Validates all topics exist and changelogs have correct topic and rep factor. -Records are buffered until the batch size reaches 500, at which point -they are sent immediately. If the batch size is less than 500, records -will be sent when the flush method is called. +Issues are pooled and raised as an Exception once inspections are complete. - + -#### KinesisSink.flush +## quixstreams.models.topics.admin + + + +#### convert\_topic\_list ```python -def flush(topic: str, partition: int) -> None +def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L110) - -Flush all buffered records for a given topic-partition. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L29) -This method sends any outstanding records that have not yet been sent -because the batch size was less than 500. It waits for all futures to -complete, ensuring that all records are successfully sent to the Kinesis -stream. +Converts `Topic`s to `ConfluentTopic`s as required for Confluent's - +`AdminClient.create_topic()`. -## quixstreams.sinks.community +**Arguments**: -This module contains Sinks developed and maintained by the members of Quix Streams community. +- `topics`: list of `Topic`s - +**Returns**: -## quixstreams.sinks.community.redis +list of confluent_kafka `ConfluentTopic`s - + -### RedisSink +### TopicAdmin ```python -class RedisSink(BatchingSink) +class TopicAdmin() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/redis.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L52) - +For performing "admin"-level operations on a Kafka cluster, mostly around topics. -#### RedisSink.\_\_init\_\_ +Primarily used to create and inspect topic configurations. -```python -def __init__(host: str, - port: int, - db: int, - value_serializer: Callable[[Any], Union[bytes, str]] = json.dumps, - key_serializer: Optional[Callable[[Any, Any], Union[bytes, - str]]] = None, - password: Optional[str] = None, - socket_timeout: float = 30.0, - **kwargs) -> None -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/redis.py#L22) +#### TopicAdmin.\_\_init\_\_ -A connector to sink processed data to Redis. +```python +def __init__(broker_address: Union[str, ConnectionConfig], + logger: logging.Logger = logger, + extra_config: Optional[Mapping] = None) +``` -It batches the processed records in memory per topic partition, and flushes them to Redis at the checkpoint. 
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L59) **Arguments**: -- `host`: Redis host. -- `port`: Redis port. -- `db`: Redis DB number. -- `value_serializer`: a callable to serialize the value to string or bytes -(defaults to json.dumps). -- `key_serializer`: an optional callable to serialize the key to string or bytes. -If not provided, the Kafka message key will be used as is. -- `password`: Redis password, optional. -- `socket_timeout`: Redis socket timeout. -Default - 30s. -- `kwargs`: Additional keyword arguments passed to the `redis.Redis` instance. - - - -## quixstreams.sinks.community.iceberg +- `broker_address`: Connection settings for Kafka. +Accepts string with Kafka broker host and port formatted as `:`, +or a ConnectionConfig object if authentication is required. +- `logger`: a Logger instance to attach librdkafka logging to +- `extra_config`: optional configs (generally accepts producer configs) - + -### AWSIcebergConfig +#### TopicAdmin.list\_topics ```python -class AWSIcebergConfig(BaseIcebergConfig) +def list_topics(timeout: float = -1) -> Dict[str, ConfluentTopicMetadata] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L42) - - - -#### AWSIcebergConfig.\_\_init\_\_ +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L91) -```python -def __init__(aws_s3_uri: str, - aws_region: Optional[str] = None, - aws_access_key_id: Optional[str] = None, - aws_secret_access_key: Optional[str] = None, - aws_session_token: Optional[str] = None) -``` +Get a list of topics and their metadata from a Kafka cluster -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L43) +**Arguments**: -Configure IcebergSink to work with AWS Glue. +- `timeout`: response timeout (seconds); Default infinite (-1) -**Arguments**: +**Returns**: -- `aws_s3_uri`: The S3 URI where the table data will be stored -(e.g., 's3://your-bucket/warehouse/'). -- `aws_region`: The AWS region for the S3 bucket and Glue catalog. -- `aws_access_key_id`: the AWS access key ID. -NOTE: can alternatively set the AWS_ACCESS_KEY_ID environment variable -when using AWS Glue. -- `aws_secret_access_key`: the AWS secret access key. -NOTE: can alternatively set the AWS_SECRET_ACCESS_KEY environment variable -when using AWS Glue. -- `aws_session_token`: a session token (or will be generated for you). -NOTE: can alternatively set the AWS_SESSION_TOKEN environment variable when -using AWS Glue. +a dict of topic names and their metadata objects - + -### IcebergSink +#### TopicAdmin.inspect\_topics ```python -class IcebergSink(BatchingSink) +def inspect_topics(topic_names: List[str], + timeout: float = 30) -> Dict[str, Optional[TopicConfig]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L76) - -IcebergSink writes batches of data to an Apache Iceberg table. - -The data will by default include the kafka message key, value, and timestamp. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L102) -It serializes incoming data batches into Parquet format and appends them to the -Iceberg table, updating the table schema as necessary. 
+A simplified way of getting the topic configurations of the provided topics -Currently, supports Apache Iceberg hosted in: +from the cluster (if they exist). -- AWS +**Arguments**: -Supported data catalogs: +- `topic_names`: a list of topic names +- `timeout`: response timeout (seconds) +>***NOTE***: `timeout` must be >0 here (expects non-neg, and 0 != inf). -- AWS Glue +**Returns**: -**Arguments**: +a dict with topic names and their respective `TopicConfig` -- `table_name`: The name of the Iceberg table. -- `config`: An IcebergConfig with all the various connection parameters. -- `data_catalog_spec`: data cataloger to use (ex. for AWS Glue, "aws_glue"). -- `schema`: The Iceberg table schema. If None, a default schema is used. -- `partition_spec`: The partition specification for the table. -If None, a default is used. + -Example setup using an AWS-hosted Iceberg with AWS Glue: +#### TopicAdmin.create\_topics +```python +def create_topics(topics: List[Topic], + timeout: float = 30, + finalize_timeout: float = 60) ``` -from quixstreams import Application -from quixstreams.sinks.community.iceberg import IcebergSink, AWSIcebergConfig -# Configure S3 bucket credentials -iceberg_config = AWSIcebergConfig( - aws_s3_uri="", aws_region="", aws_access_key_id="", aws_secret_access_key="" -) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L184) -# Configure the sink to write data to S3 with the AWS Glue catalog spec -iceberg_sink = IcebergSink( - table_name="glue.sink-test", - config=iceberg_config, - data_catalog_spec="aws_glue", -) +Create the given list of topics and confirm they are ready. -app = Application(broker_address='localhost:9092', auto_offset_reset="earliest") -topic = app.topic('sink_topic') +Also raises an exception with detailed printout should the creation +fail (it ignores issues for a topic already existing). -# Do some processing here -sdf = app.dataframe(topic=topic).print(metadata=True) +**Arguments**: -# Sink results to the IcebergSink -sdf.sink(iceberg_sink) +- `topics`: a list of `Topic` +- `timeout`: creation acknowledge timeout (seconds) +- `finalize_timeout`: topic finalization timeout (seconds) +>***NOTE***: `timeout` must be >0 here (expects non-neg, and 0 != inf). + -if __name__ == "__main__": - # Start the application - app.run() -``` +## quixstreams.models.topics.topic - + -#### IcebergSink.write +### TopicConfig ```python -def write(batch: SinkBatch) +@dataclasses.dataclass(eq=True) +class TopicConfig() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L174) - -Writes a batch of data to the Iceberg table. - -Implements retry logic to handle concurrent write conflicts. - -**Arguments**: - -- `batch`: The batch of data to write. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L42) - +Represents all kafka-level configuration for a kafka topic. -## quixstreams.sinks.community.pubsub +Generally used by Topic and any topic creation procedures. - + -### PubSubTopicNotFoundError +### Topic ```python -class PubSubTopicNotFoundError(Exception) +class Topic() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L83) -Raised when the specified topic does not exist. +A definition of a Kafka topic. 
- +Typically created with an `app = quixstreams.app.Application()` instance via +`app.topic()`, and used by `quixstreams.dataframe.StreamingDataFrame` +instance. -### PubSubSink + + +#### Topic.\_\_init\_\_ ```python -class PubSubSink(BaseSink) +def __init__( + name: str, + config: Optional[TopicConfig] = None, + value_deserializer: Optional[DeserializerType] = None, + key_deserializer: Optional[DeserializerType] = BytesDeserializer(), + value_serializer: Optional[SerializerType] = None, + key_serializer: Optional[SerializerType] = BytesSerializer(), + timestamp_extractor: Optional[TimestampExtractor] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L92) -A sink that publishes messages to Google Cloud Pub/Sub. +**Arguments**: - +- `name`: topic name +- `config`: topic configs via `TopicConfig` (creation/validation) +- `value_deserializer`: a deserializer type for values +- `key_deserializer`: a deserializer type for keys +- `value_serializer`: a serializer type for values +- `key_serializer`: a serializer type for keys +- `timestamp_extractor`: a callable that returns a timestamp in +milliseconds from a deserialized message. -#### PubSubSink.\_\_init\_\_ + + +#### Topic.row\_serialize ```python -def __init__(project_id: str, - topic_id: str, - service_account_json: Optional[str] = None, - value_serializer: Callable[[Any], Union[bytes, str]] = json.dumps, - key_serializer: Callable[[Any], str] = bytes.decode, - flush_timeout: int = 5, - **kwargs) -> None +def row_serialize(row: Row, key: Any) -> KafkaMessage ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L140) -Initialize the PubSubSink. +Serialize Row to a Kafka message structure **Arguments**: -- `project_id`: GCP project ID. -- `topic_id`: Pub/Sub topic ID. -- `service_account_json`: an optional JSON string with service account credentials -to connect to Pub/Sub. -The internal `PublisherClient` will use the Application Default Credentials if not provided. -See https://cloud.google.com/docs/authentication/provide-credentials-adc for more info. -Default - `None`. -- `value_serializer`: Function to serialize the value to string or bytes -(defaults to json.dumps). -- `key_serializer`: Function to serialize the key to string -(defaults to bytes.decode). -- `kwargs`: Additional keyword arguments passed to PublisherClient. +- `row`: Row to serialize +- `key`: message key to serialize - +**Returns**: -#### PubSubSink.add +KafkaMessage object with serialized values + + + +#### Topic.row\_deserialize ```python -def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, - topic: str, partition: int, offset: int) -> None +def row_deserialize( + message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L81) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L180) -Publish a message to Pub/Sub. +Deserialize incoming Kafka message to a Row. 
- +**Arguments**: -#### PubSubSink.flush +- `message`: an object with interface of `confluent_kafka.Message` -```python -def flush(topic: str, partition: int) -> None -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L114) - -Wait for all publish operations to complete successfully. +Row, list of Rows or None if the message is ignored. - + -## quixstreams.sinks.base.sink +## quixstreams.models.topics.utils - + -### BaseSink +#### merge\_headers ```python -class BaseSink(abc.ABC) +def merge_headers(original: KafkaHeaders, + other: HeadersMapping) -> HeadersTuples ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L11) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/utils.py#L10) -This is a base class for all sinks. +Merge two sets of Kafka message headers, overwriting headers in "origin" -Subclass it and implement its methods to create your own sink. +by the values from "other". -Note that Sinks are currently in beta, and their design may change over time. +**Arguments**: - +- `original`: original headers as a list of (key, value) tuples. +- `other`: headers to merge as a dictionary. -#### BaseSink.flush +**Returns**: + +a list of (key, value) tuples. + + + +## quixstreams.models.timestamps + + + +### TimestampType ```python -@abc.abstractmethod -def flush(topic: str, partition: int) +class TimestampType(enum.IntEnum) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/timestamps.py#L8) -This method is triggered by the Checkpoint class when it commits. + -You can use `flush()` to write the batched data to the destination (in case of -a batching sink), or confirm the delivery of the previously sent messages -(in case of a streaming sink). +#### TIMESTAMP\_NOT\_AVAILABLE -If flush() fails, the checkpoint will be aborted. +timestamps not supported by broker - + -#### BaseSink.add +#### TIMESTAMP\_CREATE\_TIME -```python -@abc.abstractmethod -def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, - topic: str, partition: int, offset: int) -``` +message creation time (or source / producer time) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L33) + -This method is triggered on every new processed record being sent to this sink. +#### TIMESTAMP\_LOG\_APPEND\_TIME -You can use it to accumulate batches of data before sending them outside, or -to send results right away in a streaming manner and confirm a delivery later -on flush(). +broker receive time - + -#### BaseSink.on\_paused +### MessageTimestamp ```python -def on_paused(topic: str, partition: int) +class MessageTimestamp() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L51) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/timestamps.py#L14) -This method is triggered when the sink is paused due to backpressure, when -the `SinkBackpressureError` is raised. +Represents a timestamp of incoming Kafka message. -Here you can react to the backpressure events. +It is made pseudo-immutable (i.e. public attributes don't have setters), and +it should not be mutated during message processing. 
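
As a quick illustration of the header-merging helper documented above, here is a tiny sketch with made-up header values; only `merge_headers` comes from the module described here:

```python
from quixstreams.models.topics.utils import merge_headers

original = [("source", b"sensor-1"), ("trace-id", b"abc")]

# Keys from the mapping overwrite matching keys from the original list;
# the result is returned as a list of (key, value) tuples.
merged = merge_headers(original, {"trace-id": b"def", "stage": b"enriched"})
```
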
- + -### BatchingSink +#### MessageTimestamp.create ```python -class BatchingSink(BaseSink) +@classmethod +def create(cls, timestamp_type: int, milliseconds: int) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L60) - -A base class for batching sinks, that need to accumulate the data first before -sending it to the external destinatios. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/timestamps.py#L41) -Examples: databases, objects stores, and other destinations where -writing every message is not optimal. +Create a Timestamp object based on data -It automatically handles batching, keeping batches in memory per topic-partition. +from `confluent_kafka.Message.timestamp()`. -You may subclass it and override the `write()` method to implement a custom -batching sink. +If timestamp type is "TIMESTAMP_NOT_AVAILABLE", the milliseconds are set to None - +**Arguments**: -#### BatchingSink.write +- `timestamp_type`: a timestamp type represented as a number +Can be one of: +- "0" - TIMESTAMP_NOT_AVAILABLE, timestamps not supported by broker. +- "1" - TIMESTAMP_CREATE_TIME, message creation time (or source / producer time). +- "2" - TIMESTAMP_LOG_APPEND_TIME, broker receive time. +- `milliseconds`: the number of milliseconds since the epoch (UTC). -```python -@abc.abstractmethod -def write(batch: SinkBatch) -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L83) +Timestamp object -This method implements actual writing to the external destination. + -It may also raise `SinkBackpressureError` if the destination cannot accept new -writes at the moment. -When this happens, the accumulated batch is dropped and the app pauses the -corresponding topic partition. +## quixstreams.models.messagecontext - + -#### BatchingSink.add +### MessageContext ```python -def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, - topic: str, partition: int, offset: int) +class MessageContext() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/messagecontext.py#L4) -Add a new record to in-memory batch. +An object with Kafka message properties. - +It is made pseudo-immutable (i.e. public attributes don't have setters), and +it should not be mutated during message processing. -#### BatchingSink.flush + -```python -def flush(topic: str, partition: int) -``` +## quixstreams.models -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L115) + -Flush an accumulated batch to the destination and drop it afterward. +## quixstreams.models.messages - + -#### BatchingSink.on\_paused +## quixstreams.models.rows -```python -def on_paused(topic: str, partition: int) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L135) +## quixstreams.models.types -When the destination is already backpressure, drop the accumulated batch. + - +### ConfluentKafkaMessageProto -## quixstreams.sinks.base.batch +```python +class ConfluentKafkaMessageProto(Protocol) +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/types.py#L16) -### SinkBatch +An interface of `confluent_kafka.Message`. 
-```python -class SinkBatch() -``` +Use it to not depend on exact implementation and simplify testing. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/batch.py#L12) +Instances of `confluent_kafka.Message` cannot be directly created from Python, +see https://github.com/confluentinc/confluent-kafka-python/issues/1535. -A batch to accumulate processed data by `BatchingSink` between the checkpoints. + -Batches are created automatically by the implementations of `BatchingSink`. +## quixstreams.platforms -**Arguments**: + -- `topic`: a topic name -- `partition`: a partition number +## quixstreams.platforms.quix.exceptions - + -#### SinkBatch.iter\_chunks +## quixstreams.platforms.quix -```python -def iter_chunks(n: int) -> Iterable[Iterable[SinkItem]] -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/batch.py#L69) +## quixstreams.platforms.quix.checks -Iterate over batch data in chunks of length n. -The last batch may be shorter. + - +#### check\_state\_management\_enabled -## quixstreams.sinks.base +```python +def check_state_management_enabled() +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/checks.py#L11) -## quixstreams.sinks.base.exceptions +Check if State Management feature is enabled for the current deployment on +Quix platform. +If it's disabled, the exception will be raised. - + -### SinkBackpressureError +#### check\_state\_dir ```python -class SinkBackpressureError(QuixException) +def check_state_dir(state_dir: Path) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/exceptions.py#L6) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/checks.py#L28) -An exception to be raised by Sinks during flush() call - -to signal a backpressure event to the application. +Check if Application "state_dir" matches the state dir on Quix platform. -When raised, the app will drop the accumulated sink batch, -pause the corresponding topic partition for -a timeout specified in `retry_after`, and resume it when it's elapsed. +If it doesn't match, the warning will be logged. 
**Arguments**: -- `retry_after`: a timeout in seconds to pause for -- `topic`: a topic name to pause -- `partition`: a partition number to pause +- `state_dir`: application state_dir path - + -## quixstreams.sinks.base.manager +## quixstreams.platforms.quix.env - + -## quixstreams.sinks.base.item +### QuixEnvironment - +```python +class QuixEnvironment() +``` -## quixstreams.utils +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/env.py#L7) - +Class to access various Quix platform environment settings -## quixstreams.utils.settings + - +#### SDK\_TOKEN -### BaseSettings +noqa: S105 + + + +#### QuixEnvironment.state\_management\_enabled ```python -class BaseSettings(_BaseSettings) +@property +def state_management_enabled() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/utils/settings.py#L10) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/env.py#L19) - +Check whether "State management" is enabled for the current deployment -#### BaseSettings.as\_dict +**Returns**: + +True if state management is enabled, otherwise False + + + +#### QuixEnvironment.deployment\_id ```python -def as_dict(plaintext_secrets: bool = False, - include: Optional[Set[str]] = None) -> dict +@property +def deployment_id() -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/utils/settings.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/env.py#L27) -Dump any non-empty config values as a dictionary. +Return current Quix deployment id. -**Arguments**: - -- `plaintext_secrets`: whether secret values are plaintext or obscured (***) -- `include`: optional list of fields to be included in the dictionary +This variable is meant to be set only by Quix Platform and only +when the application is deployed. **Returns**: -a dictionary +deployment id or None - + -## quixstreams.utils.dicts +#### QuixEnvironment.workspace\_id - +```python +@property +def workspace_id() -> Optional[str] +``` -#### dict\_values +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/env.py#L39) + +Return Quix workspace id if set + +**Returns**: + +workspace id or None + + + +#### QuixEnvironment.portal\_api ```python -def dict_values(d: object) -> List +@property +def portal_api() -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/utils/dicts.py#L4) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/env.py#L47) -Recursively unpacks a set of nested dicts to get a flattened list of leaves, +Return Quix Portal API url if set -where "leaves" are the first non-dict item. +**Returns**: -i.e {"a": {"b": {"c": 1}, "d": 2}, "e": 3} becomes [1, 2, 3] +portal API URL or None -**Arguments**: + -- `d`: initially, a dict (with potentially nested dicts) +#### QuixEnvironment.state\_dir + +```python +@property +def state_dir() -> str +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/env.py#L56) + +Return application state directory on Quix. 
**Returns**: -a list with all the leaves of the various contained dicts +path to state dir - + -## quixstreams.utils.json +## quixstreams.platforms.quix.api - + -#### dumps +### QuixPortalApiService ```python -def dumps(value: Any) -> bytes +class QuixPortalApiService() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/utils/json.py#L8) - -Serialize to JSON using `orjson` package. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/api.py#L19) -**Arguments**: +A light wrapper around the Quix Portal Api. If used in the Quix Platform, it will +use that workspaces auth token and portal endpoint, else you must provide it. -- `value`: value to serialize to JSON +Function names closely reflect the respective API endpoint, +each starting with the method [GET, POST, etc.] followed by the endpoint path. -**Returns**: +Results will be returned in the form of request's Response.json(), unless something +else is required. Non-200's will raise exceptions. -bytes +See the swagger documentation for more info about the endpoints. - + -#### loads +#### QuixPortalApiService.get\_workspace\_certificate ```python -def loads(value: bytes) -> Any +def get_workspace_certificate(workspace_id: Optional[str] = None, + timeout: float = 30) -> Optional[bytes] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/utils/json.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/api.py#L119) -Deserialize from JSON using `orjson` package. +Get a workspace TLS certificate if available. -Main differences: -- It returns `bytes` -- It doesn't allow non-str keys in dictionaries +Returns `None` if certificate is not specified. **Arguments**: -- `value`: value to deserialize from +- `workspace_id`: workspace id, optional +- `timeout`: request timeout; Default 30 **Returns**: -object - - - -## quixstreams.types +certificate as bytes if present, or None - + -## quixstreams.models.timestamps +## quixstreams.platforms.quix.config - + -### TimestampType +#### strip\_workspace\_id\_prefix ```python -class TimestampType(enum.IntEnum) +def strip_workspace_id_prefix(workspace_id: str, s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/timestamps.py#L8) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L46) - +Remove the workspace ID from a given string if it starts with it. -#### TIMESTAMP\_NOT\_AVAILABLE +Only used for consumer groups. -timestamps not supported by broker +**Arguments**: - +- `workspace_id`: the workspace id +- `s`: the string to append to -#### TIMESTAMP\_CREATE\_TIME +**Returns**: -message creation time (or source / producer time) +the string with workspace_id prefix removed - + -#### TIMESTAMP\_LOG\_APPEND\_TIME +#### prepend\_workspace\_id -broker receive time +```python +def prepend_workspace_id(workspace_id: str, s: str) -> str +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L59) -### MessageTimestamp +Add the workspace ID as a prefix to a given string if it does not have it. -```python -class MessageTimestamp() -``` +Only used for consumer groups. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/timestamps.py#L14) +**Arguments**: -Represents a timestamp of incoming Kafka message. 
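
The two module-level helpers above are symmetric. A small, hypothetical sketch of namespacing and un-namespacing a consumer group name (the workspace id is a placeholder):

```python
from quixstreams.platforms.quix.config import (
    prepend_workspace_id,
    strip_workspace_id_prefix,
)

workspace_id = "my-workspace"  # placeholder workspace id

# Add the workspace id as a prefix (no-op if the string already has it)
group = prepend_workspace_id(workspace_id, "my-consumer-group")

# Remove it again (no-op if the string does not start with the workspace id)
plain = strip_workspace_id_prefix(workspace_id, group)
```
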
+- `workspace_id`: the workspace id +- `s`: the string to append to -It is made pseudo-immutable (i.e. public attributes don't have setters), and -it should not be mutated during message processing. +**Returns**: - +the string with workspace_id prepended -#### MessageTimestamp.create + + +### QuixApplicationConfig ```python -@classmethod -def create(cls, timestamp_type: int, milliseconds: int) -> Self +@dataclasses.dataclass +class QuixApplicationConfig() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/timestamps.py#L41) - -Create a Timestamp object based on data - -from `confluent_kafka.Message.timestamp()`. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L73) -If timestamp type is "TIMESTAMP_NOT_AVAILABLE", the milliseconds are set to None +A convenience container class for Quix Application configs. -**Arguments**: + -- `timestamp_type`: a timestamp type represented as a number -Can be one of: -- "0" - TIMESTAMP_NOT_AVAILABLE, timestamps not supported by broker. -- "1" - TIMESTAMP_CREATE_TIME, message creation time (or source / producer time). -- "2" - TIMESTAMP_LOG_APPEND_TIME, broker receive time. -- `milliseconds`: the number of milliseconds since the epoch (UTC). +### QuixKafkaConfigsBuilder -**Returns**: +```python +class QuixKafkaConfigsBuilder() +``` -Timestamp object +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L83) - +Retrieves all the necessary information from the Quix API and builds all the +objects required to connect a confluent-kafka client to the Quix Platform. -## quixstreams.models +If not executed within the Quix platform directly, you must provide a Quix +"streaming" (aka "sdk") token, or Personal Access Token. - +Ideally you also know your workspace name or id. If not, you can search for it +using a known topic name, but note the search space is limited to the access level +of your token. -## quixstreams.models.messagecontext +It also currently handles the app_auto_create_topics setting for Quix Applications. - + -### MessageContext +#### QuixKafkaConfigsBuilder.\_\_init\_\_ ```python -class MessageContext() +def __init__(quix_sdk_token: Optional[str] = None, + workspace_id: Optional[str] = None, + quix_portal_api_service: Optional[QuixPortalApiService] = None, + timeout: float = 30, + topic_create_timeout: float = 60) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/messagecontext.py#L4) - -An object with Kafka message properties. - -It is made pseudo-immutable (i.e. public attributes don't have setters), and -it should not be mutated during message processing. 
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L99) - +**Arguments**: -## quixstreams.models.types +- `quix_portal_api_service`: A QuixPortalApiService instance (else generated) +- `workspace_id`: A valid Quix Workspace ID (else searched for) - + -### ConfluentKafkaMessageProto +#### QuixKafkaConfigsBuilder.convert\_topic\_response ```python -class ConfluentKafkaMessageProto(Protocol) +@classmethod +def convert_topic_response(cls, api_response: dict) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/types.py#L16) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L178) -An interface of `confluent_kafka.Message`. +Converts a GET or POST ("create") topic API response to a Topic object -Use it to not depend on exact implementation and simplify testing. +**Arguments**: -Instances of `confluent_kafka.Message` cannot be directly created from Python, -see https://github.com/confluentinc/confluent-kafka-python/issues/1535. +- `api_response`: the dict response from a get or create topic call - +**Returns**: -## quixstreams.models.serializers.avro +a corresponding Topic object - + -### AvroSerializer +#### QuixKafkaConfigsBuilder.strip\_workspace\_id\_prefix ```python -class AvroSerializer(Serializer) +def strip_workspace_id_prefix(s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L206) - +Remove the workspace ID from a given string if it starts with it. -#### AvroSerializer.\_\_init\_\_ +Only used for consumer groups. + +**Arguments**: + +- `s`: the string to append to + +**Returns**: + +the string with workspace_id prefix removed + + + +#### QuixKafkaConfigsBuilder.prepend\_workspace\_id ```python -def __init__( - schema: Schema, - strict: bool = False, - strict_allow_default: bool = False, - disable_tuple_notation: bool = False, - schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, - schema_registry_serialization_config: Optional[ - SchemaRegistrySerializationConfig] = None) +def prepend_workspace_id(s: str) -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L27) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L217) -Serializer that returns data in Avro format. +Add the workspace ID as a prefix to a given string if it does not have it. -For more information see fastavro [schemaless_writer](https://fastavro.readthedocs.io/en/latest/writer.html#fastavro._write_py.schemaless_writer) method. +Only used for consumer groups. **Arguments**: -- `schema`: The avro schema. -- `strict`: If set to True, an error will be raised if records do not contain exactly the same fields that the schema states. -Default - `False` -- `strict_allow_default`: If set to True, an error will be raised if records do not contain exactly the same fields that the schema states unless it is a missing field that has a default value in the schema. -Default - `False` -- `disable_tuple_notation`: If set to True, tuples will not be treated as a special case. Therefore, using a tuple to indicate the type of a record will not work. 
-Default - `False` -- `schema_registry_client_config`: If provided, serialization is offloaded to Confluent's AvroSerializer. -Default - `None` -- `schema_registry_serialization_config`: Additional configuration for Confluent's AvroSerializer. -Default - `None` ->***NOTE:*** `schema_registry_client_config` must also be set. - - - -### AvroDeserializer +- `s`: the string to append to -```python -class AvroDeserializer(Deserializer) -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L112) +the string with workspace_id prepended - + -#### AvroDeserializer.\_\_init\_\_ +#### QuixKafkaConfigsBuilder.search\_for\_workspace ```python -def __init__( - schema: Optional[Schema] = None, - reader_schema: Optional[Schema] = None, - return_record_name: bool = False, - return_record_name_override: bool = False, - return_named_type: bool = False, - return_named_type_override: bool = False, - handle_unicode_errors: str = "strict", - schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None -) +def search_for_workspace(workspace_name_or_id: Optional[str] = None, + timeout: Optional[float] = None) -> Optional[dict] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L113) - -Deserializer that parses data from Avro. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L228) -For more information see fastavro [schemaless_reader](https://fastavro.readthedocs.io/en/latest/reader.html#fastavro._read_py.schemaless_reader) method. +Search for a workspace given an expected workspace name or id. **Arguments**: -- `schema`: The Avro schema. -- `reader_schema`: If the schema has changed since being written then the new schema can be given to allow for schema migration. -Default - `None` -- `return_record_name`: If true, when reading a union of records, the result will be a tuple where the first value is the name of the record and the second value is the record itself. -Default - `False` -- `return_record_name_override`: If true, this will modify the behavior of return_record_name so that the record name is only returned for unions where there is more than one record. For unions that only have one record, this option will make it so that the record is returned by itself, not a tuple with the name. -Default - `False` -- `return_named_type`: If true, when reading a union of named types, the result will be a tuple where the first value is the name of the type and the second value is the record itself NOTE: Using this option will ignore return_record_name and return_record_name_override. -Default - `False` -- `return_named_type_override`: If true, this will modify the behavior of return_named_type so that the named type is only returned for unions where there is more than one named type. For unions that only have one named type, this option will make it so that the named type is returned by itself, not a tuple with the name. -Default - `False` -- `handle_unicode_errors`: Should be set to a valid string that can be used in the errors argument of the string decode() function. -Default - `"strict"` -- `schema_registry_client_config`: If provided, deserialization is offloaded to Confluent's AvroDeserializer. 
-Default - `None` +- `workspace_name_or_id`: the expected name or id of a workspace +- `timeout`: response timeout (seconds); Default 30 - +**Returns**: -## quixstreams.models.serializers.schema\_registry +the workspace data dict if search success, else None - + -### SchemaRegistryClientConfig +#### QuixKafkaConfigsBuilder.get\_workspace\_info ```python -class SchemaRegistryClientConfig(BaseSettings) +def get_workspace_info(known_workspace_topic: Optional[str] = None, + timeout: Optional[float] = None) -> dict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/schema_registry.py#L22) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L270) -Configuration required to establish the connection with a Schema Registry. +Queries for workspace data from the Quix API, regardless of instance cache, + +and updates instance attributes from query result. **Arguments**: -- `url`: Schema Registry URL. -- `ssl_ca_location`: Path to CA certificate file used to verify the -Schema Registry's private key. -- `ssl_key_location`: Path to the client's private key (PEM) used for -authentication. ->***NOTE:*** `ssl_certificate_location` must also be set. -- `ssl_certificate_location`: Path to the client's public key (PEM) used -for authentication. ->***NOTE:*** May be set without `ssl_key_location` if the private key is -stored within the PEM as well. -- `basic_auth_user_info`: Client HTTP credentials in the form of -`username:password`. ->***NOTE:*** By default, userinfo is extracted from the URL if present. +- `known_workspace_topic`: a topic you know to exist in some workspace +- `timeout`: response timeout (seconds); Default 30 - + -### SchemaRegistrySerializationConfig +#### QuixKafkaConfigsBuilder.search\_workspace\_for\_topic ```python -class SchemaRegistrySerializationConfig(BaseSettings) +def search_workspace_for_topic( + workspace_id: str, + topic: str, + timeout: Optional[float] = None) -> Optional[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/schema_registry.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L298) -Configuration that instructs Serializer how to handle communication with a +Search through all the topics in the given workspace id to see if there is a -Schema Registry. +match with the provided topic. **Arguments**: -- `auto_register_schemas`: If True, automatically register the configured schema -with Confluent Schema Registry if it has not previously been associated with the -relevant subject (determined via subject.name.strategy). Defaults to True. -- `normalize_schemas`: Whether to normalize schemas, which will transform schemas -to have a consistent format, including ordering properties and references. -- `use_latest_version`: Whether to use the latest subject version for serialization. ->***NOTE:*** There is no check that the latest schema is backwards compatible with the -object being serialized. Defaults to False. -- `subject_name_strategy`: Callable(SerializationContext, str) -> str -Defines how Schema Registry subject names are constructed. Standard naming -strategies are defined in the confluent_kafka.schema_registry namespace. -Defaults to topic_subject_name_strategy. -- `skip_known_types`: Whether or not to skip known types when resolving -schema dependencies. Defaults to False. 
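
A short sketch of the lookup helpers above, with placeholder workspace and topic names; these calls are mostly made internally while resolving Quix Cloud settings:

```python
from quixstreams.platforms.quix.config import QuixKafkaConfigsBuilder

builder = QuixKafkaConfigsBuilder(quix_sdk_token="<streaming-token>")  # placeholder token

# Look up a workspace by name or id; returns its data dict, or None if not found
workspace = builder.search_for_workspace("my-workspace")

# Or work backwards from a topic name to the workspace that owns it
owner = builder.search_for_topic_workspace("my-input-topic")
```
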
-- `reference_subject_name_strategy`: Defines how Schema Registry subject names -for schema references are constructed. Defaults to reference_subject_name_strategy. -- `use_deprecated_format`: Specifies whether the Protobuf serializer should -serialize message indexes without zig-zag encoding. This option must be explicitly -configured as older and newer Protobuf producers are incompatible. -If the consumers of the topic being produced to are using confluent-kafka-python <1.8, -then this property must be set to True until all old consumers have been upgraded. - - - -## quixstreams.models.serializers +- `workspace_id`: the workspace to search in +- `topic`: the topic to search for +- `timeout`: response timeout (seconds); Default 30 - +**Returns**: -## quixstreams.models.serializers.exceptions +the workspace_id if success, else None - + -### IgnoreMessage +#### QuixKafkaConfigsBuilder.search\_for\_topic\_workspace ```python -class IgnoreMessage(exceptions.QuixException) +def search_for_topic_workspace(topic: str, + timeout: Optional[float] = None + ) -> Optional[dict] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/exceptions.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L319) -Raise this exception from Deserializer.__call__ in order to ignore the processing -of the particular message. +Find what workspace a topic belongs to. - +If there is only one workspace altogether, it is assumed to be the workspace. +More than one means each workspace will be searched until the first hit. -## quixstreams.models.serializers.quix +**Arguments**: - +- `topic`: the topic to search for +- `timeout`: response timeout (seconds); Default 30 -### QuixDeserializer +**Returns**: + +workspace data dict if topic search success, else None + + + +#### QuixKafkaConfigsBuilder.create\_topic ```python -class QuixDeserializer(JSONDeserializer) +def create_topic(topic: Topic, timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L76) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L348) -Handles Deserialization for any Quix-formatted topic. +The actual API call to create the topic. -Parses JSON data from either `TimeseriesData` and `EventData` (ignores the rest). +**Arguments**: - +- `topic`: a Topic instance +- `timeout`: response timeout (seconds); Default 30 -#### QuixDeserializer.\_\_init\_\_ + + +#### QuixKafkaConfigsBuilder.get\_or\_create\_topic ```python -def __init__(loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) +def get_or_create_topic(topic: Topic, timeout: Optional[float] = None) -> dict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L378) + +Get or create topics in a Quix cluster as part of initializing the Topic + +object to obtain the true topic name. **Arguments**: -- `loads`: function to parse json from bytes. -Default - :py:func:`quixstreams.utils.json.loads`. +- `topic`: a `Topic` object +- `timeout`: response timeout (seconds); Default 30 +marked as "Ready" (and thus ready to produce to/consume from). 
- + -#### QuixDeserializer.split\_values +#### QuixKafkaConfigsBuilder.wait\_for\_topic\_ready\_statuses ```python -@property -def split_values() -> bool +def wait_for_topic_ready_statuses(topics: List[Topic], + timeout: Optional[float] = None, + finalize_timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L404) -Each Quix message might contain data for multiple Rows. -This property informs the downstream processors about that, so they can -expect an Iterable instead of Mapping. +After the broker acknowledges topics for creation, they will be in a - +"Creating" status; they not usable until they are set to a status of "Ready". -#### QuixDeserializer.deserialize +This blocks until all topics are marked as "Ready" or the timeout is hit. + +**Arguments**: + +- `topics`: a list of `Topic` objects +- `timeout`: response timeout (seconds); Default 30 +- `finalize_timeout`: topic finalization timeout (seconds); Default 60 +marked as "Ready" (and thus ready to produce to/consume from). + + + +#### QuixKafkaConfigsBuilder.get\_topic ```python -def deserialize(model_key: str, value: Union[List[Mapping], - Mapping]) -> Iterable[Mapping] +def get_topic(topic_name: str, timeout: Optional[float] = None) -> dict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L153) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L447) -Deserialization function for particular data types (Timeseries or EventData). +return the topic ID (the actual cluster topic name) if it exists, else raise **Arguments**: -- `model_key`: value of "__Q_ModelKey" message header -- `value`: deserialized JSON value of the message, list or dict +- `topic_name`: name of the topic +- `timeout`: response timeout (seconds); Default 30 + +**Raises**: + +- `QuixApiRequestFailure`: when topic does not exist **Returns**: -Iterable of dicts +response dict of the topic info if topic found, else None - + -### QuixSerializer +#### QuixKafkaConfigsBuilder.get\_application\_config ```python -class QuixSerializer(JSONSerializer) +def get_application_config(consumer_group_id: str) -> QuixApplicationConfig ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L274) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/config.py#L479) - +Get all the necessary attributes for an Application to run on Quix Cloud. -#### QuixSerializer.\_\_init\_\_ +**Arguments**: -```python -def __init__(as_legacy: bool = True, - dumps: Callable[[Any], Union[str, bytes]] = default_dumps) -``` +- `consumer_group_id`: consumer group id, if needed -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L278) +**Returns**: -Serializer that returns data in json format. +a QuixApplicationConfig instance -**Arguments**: + -- `as_legacy`: parse as the legacy format; Default = True -- `dumps`: a function to serialize objects to json. 
-Default - :py:func:`quixstreams.utils.json.dumps` +## quixstreams.platforms.quix.topic\_manager - + -### QuixTimeseriesSerializer +### QuixTopicManager ```python -class QuixTimeseriesSerializer(QuixSerializer) +class QuixTopicManager(TopicManager) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L321) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/topic_manager.py#L10) -Serialize data to JSON formatted according to Quix Timeseries format. +The source of all topic management with quixstreams. -The serializable object must be dictionary, and each item must be of `str`, `int`, -`float`, `bytes` or `bytearray` type. -Otherwise, the `SerializationError` will be raised. +This is specifically for Applications using the Quix Cloud. -Input: -```python -{'a': 1, 'b': 1.1, 'c': "string", 'd': b'bytes', 'Tags': {'tag1': 'tag'}} -``` +Generally initialized and managed automatically by a Quix Application, +but allows a user to work with it directly when needed, such as using it alongside +a plain `Producer` to create its topics. -Output: -```json -{ - "Timestamps": [123123123], - "NumericValues": {"a": [1], "b": [1.1]}, - "StringValues": {"c": ["string"]}, - "BinaryValues": {"d": ["Ynl0ZXM="]}, - "TagValues": {"tag1": ["tag"]} -} -``` +See methods for details. - + -### QuixEventsSerializer +#### QuixTopicManager.\_\_init\_\_ ```python -class QuixEventsSerializer(QuixSerializer) +def __init__(topic_admin: TopicAdmin, + consumer_group: str, + quix_config_builder: QuixKafkaConfigsBuilder, + timeout: float = 30, + create_timeout: float = 60, + auto_create_topics: bool = True) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/platforms/quix/topic_manager.py#L31) -Serialize data to JSON formatted according to Quix EventData format. -The input value is expected to be a dictionary with the following keys: - - "Id" (type `str`, default - "") - - "Value" (type `str`, default - ""), - - "Tags" (type `dict`, default - {}) +**Arguments**: ->***NOTE:*** All the other fields will be ignored. +- `topic_admin`: an `Admin` instance +- `quix_config_builder`: A QuixKafkaConfigsBuilder instance, else one is +generated for you. +- `timeout`: response timeout (seconds) +- `create_timeout`: timeout for topic creation -Input: -```python -{ - "Id": "an_event", - "Value": "any_string", - "Tags": {"tag1": "tag"}} -} -``` + -Output: -```json -{ - "Id": "an_event", - "Value": "any_string", - "Tags": {"tag1": "tag"}}, - "Timestamp":1692703362840389000 -} -``` +## quixstreams.state.rocksdb.windowed - + -## quixstreams.models.serializers.simple\_types +## quixstreams.state.rocksdb.windowed.serialization - + -### BytesDeserializer +#### parse\_window\_key ```python -class BytesDeserializer(Deserializer) +def parse_window_key(key: bytes) -> Tuple[bytes, int, int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L56) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/serialization.py#L21) -A deserializer to bypass bytes without any changes +Parse the window key from Rocksdb into (message_key, start, end) structure. 
- 
+Expected window key format:
+`<message_key>|<start>|<end>`

-### BytesSerializer
+**Arguments**:

-```python
-class BytesSerializer(Serializer)
-```
+- `key`: a key from Rocksdb

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L65)
+**Returns**:

-A serializer to bypass bytes without any changes
+a tuple with message key, start timestamp, end timestamp

- 
+ 

-### StringDeserializer
+#### encode\_window\_key

```python
-class StringDeserializer(Deserializer)
+def encode_window_key(start_ms: int, end_ms: int) -> bytes
```

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L74)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/serialization.py#L41)

- 
+Encode window start and end timestamps into bytes of the following format:

-#### StringDeserializer.\_\_init\_\_
+```<start_ms>|<end_ms>```

-```python
-def __init__(codec: str = "utf_8")
-```
+Encoding window keys this way makes them sortable in RocksDB within the same prefix.

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L75)
+**Arguments**:

-Deserializes bytes to strings using the specified encoding.
+- `start_ms`: window start in milliseconds
+- `end_ms`: window end in milliseconds

-**Arguments**:
+**Returns**:

-- `codec`: string encoding
-A wrapper around `confluent_kafka.serialization.StringDeserializer`.
+window timestamps as bytes

- 
+ 

-### IntegerDeserializer
+#### encode\_window\_prefix

```python
-class IntegerDeserializer(Deserializer)
+def encode_window_prefix(prefix: bytes, start_ms: int) -> bytes
```

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L93)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/serialization.py#L55)

-Deserializes bytes to integers.
+Encode window prefix and start time to iterate over keys in RocksDB

-A wrapper around `confluent_kafka.serialization.IntegerDeserializer`.
+Format:
+```<prefix>|<start_ms>```

- 
+**Arguments**:

-### DoubleDeserializer
+- `prefix`: transaction prefix
+- `start_ms`: window start time in milliseconds

-```python
-class DoubleDeserializer(Deserializer)
-```
+**Returns**:

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L111)
+bytes

-Deserializes float to IEEE 764 binary64.
+ 

-A wrapper around `confluent_kafka.serialization.DoubleDeserializer`.
+## quixstreams.state.rocksdb.windowed.metadata

- 
+ 

-### StringSerializer
+## quixstreams.state.rocksdb.windowed.store
+
+ 
+
+### WindowedRocksDBStore

```python
-class StringSerializer(Serializer)
+class WindowedRocksDBStore(RocksDBStore)
```

-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L129)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/store.py#L9)

- 
+RocksDB-based windowed state store.

-#### StringSerializer.\_\_init\_\_
+It keeps track of individual store partitions and provides access to the
+partitions' transactions. 
+ + + +#### WindowedRocksDBStore.\_\_init\_\_ ```python -def __init__(codec: str = "utf_8") +def __init__( + name: str, + topic: str, + base_dir: str, + changelog_producer_factory: Optional[ChangelogProducerFactory] = None, + options: Optional[RocksDBOptionsType] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L130) - -Serializes strings to bytes using the specified encoding. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/store.py#L17) **Arguments**: -- `codec`: string encoding +- `name`: a unique store name +- `topic`: a topic name for this store +- `base_dir`: path to a directory with the state +- `changelog_producer_factory`: a ChangelogProducerFactory instance +if using changelogs +- `options`: RocksDB options. If `None`, the default options will be used. - + -### IntegerSerializer +## quixstreams.state.rocksdb.windowed.partition + + + +### WindowedRocksDBStorePartition ```python -class IntegerSerializer(Serializer) +class WindowedRocksDBStorePartition(RocksDBStorePartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L142) - -Serializes integers to bytes +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/partition.py#L21) - +A base class to access windowed state in RocksDB. -### DoubleSerializer +It represents a single RocksDB database. -```python -class DoubleSerializer(Serializer) -``` +Besides the data, it keeps track of the latest observed timestamp and +stores the expiration index to delete expired windows. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L155) +**Arguments**: -Serializes floats to bytes +- `path`: an absolute path to the RocksDB folder +- `options`: RocksDB options. If `None`, the default options will be used. - + -## quixstreams.models.serializers.protobuf +## quixstreams.state.rocksdb.windowed.state - + -### ProtobufSerializer +### WindowedTransactionState ```python -class ProtobufSerializer(Serializer) +class WindowedTransactionState(WindowedState) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L9) - + -#### ProtobufSerializer.\_\_init\_\_ +#### WindowedTransactionState.\_\_init\_\_ ```python -def __init__( - msg_type: Message, - deterministic: bool = False, - ignore_unknown_fields: bool = False, - schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, - schema_registry_serialization_config: Optional[ - SchemaRegistrySerializationConfig] = None) +def __init__(transaction: "WindowedRocksDBPartitionTransaction", + prefix: bytes) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L25) - -Serializer that returns data in protobuf format. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L12) -Serialisation from a python dictionary can have a significant performance impact. An alternative is to pass the serializer an object of the `msg_type` class. +A windowed state to be provided into `StreamingDataFrame` window functions. 
**Arguments**: -- `msg_type`: protobuf message class. -- `deterministic`: If true, requests deterministic serialization of the protobuf, with predictable ordering of map keys -Default - `False` -- `ignore_unknown_fields`: If True, do not raise errors for unknown fields. -Default - `False` -- `schema_registry_client_config`: If provided, serialization is offloaded to Confluent's ProtobufSerializer. -Default - `None` -- `schema_registry_serialization_config`: Additional configuration for Confluent's ProtobufSerializer. -Default - `None` ->***NOTE:*** `schema_registry_client_config` must also be set. +- `transaction`: instance of `WindowedRocksDBPartitionTransaction` - + -### ProtobufDeserializer +#### WindowedTransactionState.get\_window ```python -class ProtobufDeserializer(Deserializer) +def get_window(start_ms: int, + end_ms: int, + default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L110) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L23) - +Get the value of the window defined by `start` and `end` timestamps -#### ProtobufDeserializer.\_\_init\_\_ - -```python -def __init__( - msg_type: Message, - use_integers_for_enums: bool = False, - preserving_proto_field_name: bool = False, - to_dict: bool = True, - schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, - schema_registry_serialization_config: Optional[ - SchemaRegistrySerializationConfig] = None) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L111) - -Deserializer that parses protobuf data into a dictionary suitable for a StreamingDataframe. - -Deserialisation to a python dictionary can have a significant performance impact. You can disable this behavior using `to_dict`, in that case the protobuf message will be used as the StreamingDataframe row value. +if the window is present in the state, else default **Arguments**: -- `msg_type`: protobuf message class. -- `use_integers_for_enums`: If true, use integers instead of enum names. -Default - `False` -- `preserving_proto_field_name`: If True, use the original proto field names as -defined in the .proto file. If False, convert the field names to -lowerCamelCase. -Default - `False` -- `to_dict`: If false, return the protobuf message instead of a dict. -Default - `True` -- `schema_registry_client_config`: If provided, deserialization is offloaded to Confluent's ProtobufDeserializer. -Default - `None` -- `schema_registry_serialization_config`: Additional configuration for Confluent's ProtobufDeserializer. -Default - `None` ->***NOTE:*** `schema_registry_client_config` must also be set. 
+- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `default`: default value to return if the key is not found - +**Returns**: -## quixstreams.models.serializers.json +value or None if the key is not found and `default` is not provided - + -### JSONSerializer +#### WindowedTransactionState.update\_window ```python -class JSONSerializer(Serializer) +def update_window(start_ms: int, + end_ms: int, + value: Any, + timestamp_ms: int, + window_timestamp_ms: Optional[int] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L32) - - - -#### JSONSerializer.\_\_init\_\_ - -```python -def __init__( - dumps: Callable[[Any], Union[str, bytes]] = default_dumps, - schema: Optional[Mapping] = None, - validator: Optional[Validator] = None, - schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None, - schema_registry_serialization_config: Optional[ - SchemaRegistrySerializationConfig] = None) -``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L39) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L33) +Set a value for the window. -Serializer that returns data in json format. +This method will also update the latest observed timestamp in state partition +using the provided `timestamp`. **Arguments**: -- `dumps`: a function to serialize objects to json. -Default - :py:func:`quixstreams.utils.json.dumps` -- `schema`: A schema used to validate the data using [`jsonschema.Draft202012Validator`](https://python-jsonschema.readthedocs.io/en/stable/api/jsonschema/validators/`jsonschema.validators.Draft202012Validator`). -Default - `None` -- `validator`: A jsonschema validator used to validate the data. Takes precedences over the schema. -Default - `None` -- `schema_registry_client_config`: If provided, serialization is offloaded to Confluent's JSONSerializer. -Default - `None` -- `schema_registry_serialization_config`: Additional configuration for Confluent's JSONSerializer. -Default - `None` ->***NOTE:*** `schema_registry_client_config` must also be set. +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `value`: value of the window +- `timestamp_ms`: current message timestamp in milliseconds +- `window_timestamp_ms`: arbitrary timestamp stored with the window value - + -### JSONDeserializer +#### WindowedTransactionState.get\_latest\_timestamp ```python -class JSONDeserializer(Deserializer) +def get_latest_timestamp() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L119) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L69) - +Get the latest observed timestamp for the current message key. -#### JSONDeserializer.\_\_init\_\_ +Use this timestamp to determine if the arriving event is late and should be +discarded from the processing. 
+ +**Returns**: + +latest observed event timestamp in milliseconds + + + +#### WindowedTransactionState.expire\_windows ```python -def __init__( - loads: Callable[[Union[bytes, bytearray]], Any] = default_loads, - schema: Optional[Mapping] = None, - validator: Optional[Validator] = None, - schema_registry_client_config: Optional[SchemaRegistryClientConfig] = None -) +def expire_windows(max_start_time: int, + delete: bool = True) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L120) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L81) -Deserializer that parses data from JSON +Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. + +This method marks the latest found window as expired in the expiration index, +so consecutive calls may yield different results for the same "latest timestamp". **Arguments**: -- `loads`: function to parse json from bytes. -Default - :py:func:`quixstreams.utils.json.loads`. -- `schema`: A schema used to validate the data using [`jsonschema.Draft202012Validator`](https://python-jsonschema.readthedocs.io/en/stable/api/jsonschema/validators/`jsonschema.validators.Draft202012Validator`). -Default - `None` -- `validator`: A jsonschema validator used to validate the data. Takes precedences over the schema. -Default - `None` -- `schema_registry_client_config`: If provided, deserialization is offloaded to Confluent's JSONDeserializer. -Default - `None` +- `max_start_time`: The timestamp up to which windows are considered expired, inclusive. +- `delete`: If True, expired windows will be deleted. - +**Returns**: -## quixstreams.models.serializers.base +A sorted list of tuples in the format `((start, end), value)`. - + -### SerializationContext +#### WindowedTransactionState.get\_windows ```python -class SerializationContext(_SerializationContext) +def get_windows(start_from_ms: int, + start_to_ms: int, + backwards: bool = False) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/base.py#L24) - -Provides additional context for message serialization/deserialization. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L98) -Every `Serializer` and `Deserializer` receives an instance of `SerializationContext` +Get all windows that start between "start_from_ms" and "start_to_ms". - +**Arguments**: -### Deserializer +- `start_from_ms`: The minimal window start time, exclusive. +- `start_to_ms`: The maximum window start time, inclusive. +- `backwards`: If True, yields windows in reverse order. -```python -class Deserializer(abc.ABC) -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/base.py#L44) +A sorted list of tuples in the format `((start, end), value)`. 
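+
+The sketch below shows how the windowed state methods in this section compose
+inside a custom aggregation step. It is only an illustration: the `state`
+object is assumed to be a `WindowedTransactionState` provided by the framework,
+and the window size, grace period, and counting logic are made-up values for
+the example rather than library defaults.
+
+```python
+WINDOW_SIZE_MS = 10_000  # assumed 10-second tumbling windows
+GRACE_MS = 1_000         # assumed grace period before expiring windows
+
+
+def count_in_window(state, timestamp_ms: int) -> list:
+    # Align the message timestamp to the start of its tumbling window.
+    start_ms = timestamp_ms - (timestamp_ms % WINDOW_SIZE_MS)
+    end_ms = start_ms + WINDOW_SIZE_MS
+
+    # Read the current aggregate for this window (0 if the window is new)
+    # and write the incremented count back, advancing the observed timestamp.
+    current = state.get_window(start_ms, end_ms, default=0)
+    state.update_window(start_ms, end_ms, value=current + 1, timestamp_ms=timestamp_ms)
+
+    # Expire (and delete) windows whose start time falls behind the latest
+    # observed timestamp by more than the window size plus the grace period.
+    latest_ts = state.get_latest_timestamp() or timestamp_ms
+    expired = state.expire_windows(max_start_time=latest_ts - WINDOW_SIZE_MS - GRACE_MS)
+
+    # Each expired item is ((start, end), value), ready to be emitted downstream.
+    return [{"start": s, "end": e, "count": v} for (s, e), v in expired]
+```
+
+In practice the built-in tumbling and hopping window operators wrap this kind
+of logic; the sketch only demonstrates how the methods above fit together.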
- + -#### Deserializer.\_\_init\_\_ +#### WindowedTransactionState.delete\_windows ```python -def __init__(*args, **kwargs) +def delete_windows(max_start_time: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/base.py#L45) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/state.py#L116) -A base class for all Deserializers +Delete windows from RocksDB up to the specified `max_start_time` timestamp. - +This method removes all window entries that have a start time less than or equal to the given +`max_start_time`. It ensures that expired data is cleaned up efficiently without affecting +unexpired windows. -#### Deserializer.split\_values +**Arguments**: -```python -@property -def split_values() -> bool -``` +- `max_start_time`: The timestamp up to which windows should be deleted, inclusive. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/base.py#L51) + -Return True if the deserialized message should be considered as Iterable -and each item in it should be processed as a separate message. +## quixstreams.state.rocksdb.windowed.transaction - + -### Serializer +### WindowedRocksDBPartitionTransaction ```python -class Serializer(abc.ABC) +class WindowedRocksDBPartitionTransaction(PartitionTransaction) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/base.py#L62) - -A base class for all Serializers +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/transaction.py#L34) - + -#### Serializer.extra\_headers +#### WindowedRocksDBPartitionTransaction.expire\_windows ```python -@property -def extra_headers() -> HeadersMapping +def expire_windows(max_start_time: int, + prefix: bytes, + delete: bool = True) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/base.py#L68) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/transaction.py#L128) -Informs producer to set additional headers +Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. -for the message it will be serializing +This method marks the latest found window as expired in the expiration index, +so consecutive calls may yield different results for the same "latest timestamp". -Must return a dictionary with headers. -Keys must be strings, and values must be strings, bytes or None. +How it works: +- First, it checks the expiration cache for the start time of the last expired + window for the current prefix. If found, this value helps reduce the search + space and prevents returning previously expired windows. +- Next, it iterates over window segments and identifies the windows that should + be marked as expired. +- Finally, it updates the expiration cache with the start time of the latest + windows found. + +**Arguments**: + +- `max_start_time`: The timestamp up to which windows are considered expired, inclusive. +- `prefix`: The key prefix for filtering windows. +- `delete`: If True, expired windows will be deleted. **Returns**: -dict with headers +A sorted list of tuples in the format `((start, end), value)`. 
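+
+As a rough illustration of the prefix-scoped variant above (together with
+`delete_windows`, documented next), the snippet below sketches a housekeeping
+pass over a single message key. The transaction object, key prefix, and
+watermark value are assumptions made for the example, not values the library
+hands to user code in this exact form.
+
+```python
+def flush_closed_windows(tx, message_key: bytes, watermark_ms: int) -> None:
+    # Collect windows whose start time is at or before the watermark.
+    # `delete=False` keeps the entries so they can still be re-read if needed.
+    expired = tx.expire_windows(
+        max_start_time=watermark_ms,
+        prefix=message_key,
+        delete=False,
+    )
+    for (start, end), value in expired:
+        print(f"window [{start}, {end}) for key {message_key!r} closed with {value}")
+
+    # Physically remove everything up to the same watermark once it has been emitted.
+    tx.delete_windows(max_start_time=watermark_ms, prefix=message_key)
+```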
- + -## quixstreams.models.messages +#### WindowedRocksDBPartitionTransaction.delete\_windows - +```python +def delete_windows(max_start_time: int, prefix: bytes) -> None +``` -## quixstreams.models.rows +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/transaction.py#L185) - +Delete windows from RocksDB up to the specified `max_start_time` timestamp. -## quixstreams.models.topics +This method removes all window entries that have a start time less than or equal to the given +`max_start_time`. It ensures that expired data is cleaned up efficiently without affecting +unexpired windows. - +How it works: +- It retrieves the start time of the last deleted window for the given prefix from the +deletion index. This minimizes redundant scans over already deleted windows. +- It iterates over the windows starting from the last deleted timestamp up to the `max_start_time`. +- Each window within this range is deleted from the database. +- After deletion, it updates the deletion index with the start time of the latest window +that was deleted to keep track of progress. -## quixstreams.models.topics.admin +**Arguments**: - +- `max_start_time`: The timestamp up to which windows should be deleted, inclusive. +- `prefix`: The key prefix used to identify and filter relevant windows. -#### convert\_topic\_list + + +#### WindowedRocksDBPartitionTransaction.get\_windows ```python -def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic] +def get_windows(start_from_ms: int, + start_to_ms: int, + prefix: bytes, + backwards: bool = False) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/windowed/transaction.py#L232) -Converts `Topic`s to `ConfluentTopic`s as required for Confluent's +Get all windows that start between "start_from_ms" and "start_to_ms" -`AdminClient.create_topic()`. +within the specified prefix. + +This function also checks the update cache for any updates not yet +committed to RocksDB. **Arguments**: -- `topics`: list of `Topic`s +- `start_from_ms`: The minimal window start time, exclusive. +- `start_to_ms`: The maximum window start time, inclusive. +- `prefix`: The key prefix for filtering windows. +- `backwards`: If True, yields windows in reverse order. **Returns**: -list of confluent_kafka `ConfluentTopic`s +A sorted list of tuples in the format `((start, end), value)`. - + -### TopicAdmin +## quixstreams.state.rocksdb -```python -class TopicAdmin() -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L52) +## quixstreams.state.rocksdb.exceptions -For performing "admin"-level operations on a Kafka cluster, mostly around topics. + -Primarily used to create and inspect topic configurations. 
+## quixstreams.state.rocksdb.metadata - + -#### TopicAdmin.\_\_init\_\_ +## quixstreams.state.rocksdb.options + + + +### RocksDBOptions ```python -def __init__(broker_address: Union[str, ConnectionConfig], - logger: logging.Logger = logger, - extra_config: Optional[Mapping] = None) +@dataclasses.dataclass(frozen=True) +class RocksDBOptions(RocksDBOptionsType) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L59) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/options.py#L26) + +RocksDB database options. **Arguments**: -- `broker_address`: Connection settings for Kafka. -Accepts string with Kafka broker host and port formatted as `:`, -or a ConnectionConfig object if authentication is required. -- `logger`: a Logger instance to attach librdkafka logging to -- `extra_config`: optional configs (generally accepts producer configs) +- `dumps`: function to dump data to JSON +- `loads`: function to load data from JSON +- `open_max_retries`: number of times to retry opening the database +if it's locked by another process. To disable retrying, pass 0 +- `open_retry_backoff`: number of seconds to wait between each retry. +Please see `rocksdict.Options` for a complete description of other options. - + -#### TopicAdmin.list\_topics +#### RocksDBOptions.to\_options ```python -def list_topics(timeout: float = -1) -> Dict[str, ConfluentTopicMetadata] +def to_options() -> rocksdict.Options ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L91) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/options.py#L54) -Get a list of topics and their metadata from a Kafka cluster +Convert parameters to `rocksdict.Options` -**Arguments**: +**Returns**: -- `timeout`: response timeout (seconds); Default infinite (-1) +instance of `rocksdict.Options` -**Returns**: + -a dict of topic names and their metadata objects +## quixstreams.state.rocksdb.types - + -#### TopicAdmin.inspect\_topics +## quixstreams.state.rocksdb.partition + + + +### RocksDBStorePartition ```python -def inspect_topics(topic_names: List[str], - timeout: float = 30) -> Dict[str, Optional[TopicConfig]] +class RocksDBStorePartition(StorePartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L102) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L30) -A simplified way of getting the topic configurations of the provided topics +A base class to access state in RocksDB. -from the cluster (if they exist). +It represents a single RocksDB database. -**Arguments**: +Responsibilities: + 1. Managing access to the RocksDB instance + 2. Creating transactions to interact with data + 3. Flushing WriteBatches to the RocksDB -- `topic_names`: a list of topic names -- `timeout`: response timeout (seconds) ->***NOTE***: `timeout` must be >0 here (expects non-neg, and 0 != inf). +It opens the RocksDB on `__init__`. If the db is locked by another process, +it will retry according to `open_max_retries` and `open_retry_backoff` options. -**Returns**: +**Arguments**: -a dict with topic names and their respective `TopicConfig` +- `path`: an absolute path to the RocksDB folder +- `options`: RocksDB options. If `None`, the default options will be used. 
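+
+The `RocksDBOptions` documented above are usually left at their defaults. The
+snippet below is a minimal sketch of overriding them; the import path and the
+`rocksdb_options` argument on `Application` are assumptions made for
+illustration, so verify them against the Application reference before relying
+on this wiring.
+
+```python
+from quixstreams import Application
+from quixstreams.state.rocksdb import RocksDBOptions  # assumed import path
+
+# Retry opening a locked RocksDB up to 10 times, waiting 3 seconds between tries.
+options = RocksDBOptions(open_max_retries=10, open_retry_backoff=3.0)
+
+# Assumed wiring: every store partition opened by the Application uses these options.
+app = Application(broker_address="localhost:9092", rocksdb_options=options)
+```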
- + -#### TopicAdmin.create\_topics +#### RocksDBStorePartition.write ```python -def create_topics(topics: List[Topic], - timeout: float = 30, - finalize_timeout: float = 60) +def write(cache: PartitionTransactionCache, + processed_offset: Optional[int], + changelog_offset: Optional[int], + batch: Optional[WriteBatch] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L184) - -Create the given list of topics and confirm they are ready. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L115) -Also raises an exception with detailed printout should the creation -fail (it ignores issues for a topic already existing). +Write data to RocksDB **Arguments**: -- `topics`: a list of `Topic` -- `timeout`: creation acknowledge timeout (seconds) -- `finalize_timeout`: topic finalization timeout (seconds) ->***NOTE***: `timeout` must be >0 here (expects non-neg, and 0 != inf). - - - -## quixstreams.models.topics.utils +- `cache`: The modified data +- `processed_offset`: The offset processed to generate the data. +- `changelog_offset`: The changelog message offset of the data. +- `batch`: prefilled `rocksdict.WriteBatch`, optional. - + -#### merge\_headers +#### RocksDBStorePartition.get ```python -def merge_headers(original: KafkaHeaders, - other: HeadersMapping) -> HeadersTuples +def get(key: bytes, + cf_name: str = "default") -> Union[bytes, Literal[Marker.UNDEFINED]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/utils.py#L10) - -Merge two sets of Kafka message headers, overwriting headers in "origin" +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L180) -by the values from "other". +Get a key from RocksDB. **Arguments**: -- `original`: original headers as a list of (key, value) tuples. -- `other`: headers to merge as a dictionary. +- `key`: a key encoded to `bytes` +- `default`: a default value to return if the key is not found. +- `cf_name`: rocksdb column family name. Default - "default" **Returns**: -a list of (key, value) tuples. - - - -## quixstreams.models.topics.topic +a value if the key is present in the DB. Otherwise, `default` - + -### TopicConfig +#### RocksDBStorePartition.exists ```python -@dataclasses.dataclass(eq=True) -class TopicConfig() +def exists(key: bytes, cf_name: str = "default") -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L42) - -Represents all kafka-level configuration for a kafka topic. - -Generally used by Topic and any topic creation procedures. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L196) -### Topic +Check if a key is present in the DB. -```python -class Topic() -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L83) +- `key`: a key encoded to `bytes`. +- `cf_name`: rocksdb column family name. Default - "default" -A definition of a Kafka topic. +**Returns**: -Typically created with an `app = quixstreams.app.Application()` instance via -`app.topic()`, and used by `quixstreams.dataframe.StreamingDataFrame` -instance. +`True` if the key is present, `False` otherwise. 
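+
+A purely illustrative sketch of the read path above; the `partition` object is
+assumed to be an already-opened `RocksDBStorePartition`, which is normally
+created and managed by the framework rather than by user code.
+
+```python
+key = b"sensor-1"
+
+# Check for presence first, then read the raw bytes stored in RocksDB.
+if partition.exists(key):
+    raw_value = partition.get(key)
+else:
+    raw_value = None
+```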
- + -#### Topic.\_\_init\_\_ +#### RocksDBStorePartition.get\_processed\_offset ```python -def __init__( - name: str, - config: Optional[TopicConfig] = None, - value_deserializer: Optional[DeserializerType] = None, - key_deserializer: Optional[DeserializerType] = BytesDeserializer(), - value_serializer: Optional[SerializerType] = None, - key_serializer: Optional[SerializerType] = BytesSerializer(), - timestamp_extractor: Optional[TimestampExtractor] = None) +def get_processed_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L92) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L207) -**Arguments**: +Get last processed offset for the given partition -- `name`: topic name -- `config`: topic configs via `TopicConfig` (creation/validation) -- `value_deserializer`: a deserializer type for values -- `key_deserializer`: a deserializer type for keys -- `value_serializer`: a serializer type for values -- `key_serializer`: a serializer type for keys -- `timestamp_extractor`: a callable that returns a timestamp in -milliseconds from a deserialized message. +**Returns**: - +offset or `None` if there's no processed offset yet -#### Topic.row\_serialize + + +#### RocksDBStorePartition.get\_changelog\_offset ```python -def row_serialize(row: Row, key: Any) -> KafkaMessage +def get_changelog_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L140) - -Serialize Row to a Kafka message structure - -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L222) -- `row`: Row to serialize -- `key`: message key to serialize +Get offset that the changelog is up-to-date with. **Returns**: -KafkaMessage object with serialized values +offset or `None` if there's no processed offset yet - + -#### Topic.row\_deserialize +#### RocksDBStorePartition.close ```python -def row_deserialize( - message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None] +def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L180) - -Deserialize incoming Kafka message to a Row. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L234) -**Arguments**: +Close the underlying RocksDB -- `message`: an object with interface of `confluent_kafka.Message` + -**Returns**: +#### RocksDBStorePartition.path -Row, list of Rows or None if the message is ignored. 
+```python +@property +def path() -> str +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L247) -## quixstreams.models.topics.exceptions +Absolute path to RocksDB database folder - +**Returns**: -## quixstreams.models.topics.manager +file path - + -#### affirm\_ready\_for\_create +#### RocksDBStorePartition.destroy ```python -def affirm_ready_for_create(topics: List[Topic]) +@classmethod +def destroy(cls, path: str) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L255) -Validate a list of topics is ready for creation attempt +Delete underlying RocksDB database + +The database must be closed first. **Arguments**: -- `topics`: list of `Topic`s +- `path`: an absolute path to the RocksDB folder - + -### TopicManager +#### RocksDBStorePartition.get\_column\_family\_handle ```python -class TopicManager() +def get_column_family_handle(cf_name: str) -> ColumnFamily ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L31) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L265) -The source of all topic management for a Quix Streams Application. - -Intended only for internal use by Application. +Get a column family handle to pass to it WriteBatch. -To create a Topic, use Application.topic() or generate them directly. +This method will cache the CF handle instance to avoid creating them +repeatedly. - +**Arguments**: -#### TopicManager.\_\_init\_\_ +- `cf_name`: column family name + +**Returns**: + +instance of `rocksdict.ColumnFamily` + + + +#### RocksDBStorePartition.get\_column\_family ```python -def __init__(topic_admin: TopicAdmin, - consumer_group: str, - timeout: float = 30, - create_timeout: float = 60, - auto_create_topics: bool = True) +def get_column_family(cf_name: str) -> Rdict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L52) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/partition.py#L286) -**Arguments**: +Get a column family instance. -- `topic_admin`: an `Admin` instance (required for some functionality) -- `consumer_group`: the consumer group (of the `Application`) -- `timeout`: response timeout (seconds) -- `create_timeout`: timeout for topic creation +This method will cache the CF instance to avoid creating them repeatedly. - +**Arguments**: -#### TopicManager.changelog\_topics +- `cf_name`: column family name -```python -@property -def changelog_topics() -> Dict[Optional[str], Dict[str, Topic]] -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L104) +instance of `rocksdict.Rdict` for the given column family -Note: `Topic`s are the changelogs. 
+ -returns: the changelog topic dict, {topic_name: {suffix: Topic}} +## quixstreams.state.rocksdb.store - + -#### TopicManager.all\_topics +### RocksDBStore ```python -@property -def all_topics() -> Dict[str, Topic] +class RocksDBStore(Store) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/store.py#L18) -Every registered topic name mapped to its respective `Topic`. +RocksDB-based state store. -returns: full topic dict, {topic_name: Topic} +It keeps track of individual store partitions and provides access to the +partitions' transactions. - + -#### TopicManager.topic\_config +#### RocksDBStore.\_\_init\_\_ ```python -def topic_config(num_partitions: Optional[int] = None, - replication_factor: Optional[int] = None, - extra_config: Optional[dict] = None) -> TopicConfig +def __init__( + name: str, + topic: Optional[str], + base_dir: str, + changelog_producer_factory: Optional[ChangelogProducerFactory] = None, + options: Optional[RocksDBOptionsType] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L223) - -Convenience method for generating a `TopicConfig` with default settings +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/store.py#L26) **Arguments**: -- `num_partitions`: the number of topic partitions -- `replication_factor`: the topic replication factor -- `extra_config`: other optional configuration settings +- `name`: a unique store name +- `topic`: a topic name for this store +- `base_dir`: path to a directory with the state +- `changelog_producer_factory`: a ChangelogProducerFactory instance +if using changelogs +- `options`: RocksDB options. If `None`, the default options will be used. -**Returns**: + -a TopicConfig object +## quixstreams.state.base - + -#### TopicManager.topic +## quixstreams.state.base.state + + + +### State ```python -def topic(name: str, - value_deserializer: Optional[DeserializerType] = None, - key_deserializer: Optional[DeserializerType] = "bytes", - value_serializer: Optional[SerializerType] = None, - key_serializer: Optional[SerializerType] = "bytes", - config: Optional[TopicConfig] = None, - timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic +class State(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L13) -A convenience method for generating a `Topic`. Will use default config options +Primary interface for working with key-value state data from `StreamingDataFrame` -as dictated by the TopicManager. 
+ + +#### State.get + +```python +@abstractmethod +def get(key: Any, default: Any = None) -> Optional[Any] +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L19) + +Get the value for key if key is present in the state, else default **Arguments**: -- `name`: topic name -- `value_deserializer`: a deserializer type for values -- `key_deserializer`: a deserializer type for keys -- `value_serializer`: a serializer type for values -- `key_serializer`: a serializer type for keys -- `config`: optional topic configurations (for creation/validation) -- `timestamp_extractor`: a callable that returns a timestamp in -milliseconds from a deserialized message. +- `key`: key +- `default`: default value to return if the key is not found **Returns**: -Topic object with creation configs +value or None if the key is not found and `default` is not provided - + -#### TopicManager.register +#### State.set ```python -def register(topic: Topic) -> Topic +@abstractmethod +def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L290) - -Register an already generated :class:`quixstreams.models.topics.Topic` to the topic manager. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L30) -The topic name and config can be updated by the topic manager. +Set value for the key. **Arguments**: -- `topic`: The topic to register +- `key`: key +- `value`: value - + -#### TopicManager.repartition\_topic +#### State.delete ```python -def repartition_topic(operation: str, - topic_name: str, - value_deserializer: Optional[DeserializerType] = "json", - key_deserializer: Optional[DeserializerType] = "json", - value_serializer: Optional[SerializerType] = "json", - key_serializer: Optional[SerializerType] = "json", - timeout: Optional[float] = None) -> Topic +@abstractmethod +def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L308) - -Create an internal repartition topic. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L39) -**Arguments**: +Delete value for the key. -- `operation`: name of the GroupBy operation (column name or user-defined). -- `topic_name`: name of the topic the GroupBy is sourced from. -- `value_deserializer`: a deserializer type for values; default - JSON -- `key_deserializer`: a deserializer type for keys; default - JSON -- `value_serializer`: a serializer type for values; default - JSON -- `key_serializer`: a serializer type for keys; default - JSON -- `timeout`: config lookup timeout (seconds); Default 30 +This function always returns `None`, even if value is not found. 
-**Returns**: +**Arguments**: -`Topic` object (which is also stored on the TopicManager) +- `key`: key - + -#### TopicManager.changelog\_topic +#### State.exists ```python -def changelog_topic(topic_name: Optional[str], - store_name: str, - config: Optional[TopicConfig] = None, - timeout: Optional[float] = None) -> Topic +@abstractmethod +def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L348) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L49) -Performs all the logic necessary to generate a changelog topic based on an +Check if the key exists in state. -optional "source topic" (aka input/consumed topic). +**Arguments**: -Its main goal is to ensure partition counts of the to-be generated changelog -match the source topic, and ensure the changelog topic is compacted. Also -enforces the serialization type. All `Topic` objects generated with this are -stored on the TopicManager. +- `key`: key -If source topic already exists, defers to the existing topic settings, else -uses the settings as defined by the `Topic` (and its defaults) as generated -by the `TopicManager`. +**Returns**: -In general, users should NOT need this; an Application knows when/how to -generate changelog topics. To turn off changelogs, init an Application with -"use_changelog_topics"=`False`. +True if key exists, False otherwise -**Arguments**: + -- `topic_name`: name of consumed topic (app input topic) -> NOTE: normally contain any prefixes added by TopicManager.topic() -- `store_name`: name of the store this changelog belongs to -(default, rolling10s, etc.) -- `config`: the changelog topic configuration. Default to `topic_name` configuration or TopicManager default -- `timeout`: config lookup timeout (seconds); Default 30 +### TransactionState -**Returns**: +```python +class TransactionState(State) +``` -`Topic` object (which is also stored on the TopicManager) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L58) - + -#### TopicManager.create\_topics +#### TransactionState.\_\_init\_\_ ```python -def create_topics(topics: List[Topic], - timeout: Optional[float] = None, - create_timeout: Optional[float] = None) +def __init__(prefix: bytes, transaction: "PartitionTransaction") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L416) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L64) -Creates topics via an explicit list of provided `Topics`. - -Exists as a way to manually specify what topics to create; otherwise, -`create_all_topics()` is generally simpler. 
+Simple key-value state to be provided into `StreamingDataFrame` functions **Arguments**: -- `topics`: list of `Topic`s -- `timeout`: creation acknowledge timeout (seconds); Default 30 -- `create_timeout`: topic finalization timeout (seconds); Default 60 +- `transaction`: instance of `PartitionTransaction` - + -#### TopicManager.create\_all\_topics +#### TransactionState.get ```python -def create_all_topics(timeout: Optional[float] = None, - create_timeout: Optional[float] = None) +def get(key: Any, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L444) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L73) -A convenience method to create all Topic objects stored on this TopicManager. - -If `auto_create_topics` is set to False no topic will be created. +Get the value for key if key is present in the state, else default **Arguments**: -- `timeout`: creation acknowledge timeout (seconds); Default 30 -- `create_timeout`: topic finalization timeout (seconds); Default 60 +- `key`: key +- `default`: default value to return if the key is not found - +**Returns**: -#### TopicManager.validate\_all\_topics +value or None if the key is not found and `default` is not provided + + + +#### TransactionState.set ```python -def validate_all_topics(timeout: Optional[float] = None) +def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L460) - -Validates all topics exist and changelogs have correct topic and rep factor. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L83) -Issues are pooled and raised as an Exception once inspections are complete. +Set value for the key. - +**Arguments**: -## quixstreams.state.rocksdb.windowed.store +- `key`: key +- `value`: value - + -### WindowedRocksDBStore +#### TransactionState.delete ```python -class WindowedRocksDBStore(RocksDBStore) +def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/store.py#L9) - -RocksDB-based windowed state store. - -It keeps track of individual store partitions and provides access to the -partitions' transactions. - - - -#### WindowedRocksDBStore.\_\_init\_\_ +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L91) -```python -def __init__( - name: str, - topic: str, - base_dir: str, - changelog_producer_factory: Optional[ChangelogProducerFactory] = None, - options: Optional[RocksDBOptionsType] = None) -``` +Delete value for the key. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/store.py#L17) +This function always returns `None`, even if value is not found. **Arguments**: -- `name`: a unique store name -- `topic`: a topic name for this store -- `base_dir`: path to a directory with the state -- `changelog_producer_factory`: a ChangelogProducerFactory instance -if using changelogs -- `options`: RocksDB options. If `None`, the default options will be used. 
- - - -## quixstreams.state.rocksdb.windowed.partition +- `key`: key - + -### WindowedRocksDBStorePartition +#### TransactionState.exists ```python -class WindowedRocksDBStorePartition(RocksDBStorePartition) +def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/partition.py#L21) - -A base class to access windowed state in RocksDB. - -It represents a single RocksDB database. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L100) -Besides the data, it keeps track of the latest observed timestamp and -stores the expiration index to delete expired windows. +Check if the key exists in state. **Arguments**: -- `path`: an absolute path to the RocksDB folder -- `options`: RocksDB options. If `None`, the default options will be used. +- `key`: key - +**Returns**: -## quixstreams.state.rocksdb.windowed.metadata +True if key exists, False otherwise - + -## quixstreams.state.rocksdb.windowed.transaction +## quixstreams.state.base.partition - + -### WindowedRocksDBPartitionTransaction +### StorePartition ```python -class WindowedRocksDBPartitionTransaction(PartitionTransaction) +class StorePartition(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/transaction.py#L34) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L26) - +A base class to access state in the underlying storage. +It represents a single instance of some storage (e.g. a single database for +the persistent storage). -#### WindowedRocksDBPartitionTransaction.expire\_windows + + +#### StorePartition.get\_processed\_offset ```python -def expire_windows(max_start_time: int, - prefix: bytes, - delete: bool = True) -> list[tuple[tuple[int, int], Any]] +@abstractmethod +def get_processed_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/transaction.py#L128) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L57) -Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. +Get last processed offset for the given partition -This method marks the latest found window as expired in the expiration index, -so consecutive calls may yield different results for the same "latest timestamp". +**Returns**: -How it works: -- First, it checks the expiration cache for the start time of the last expired - window for the current prefix. If found, this value helps reduce the search - space and prevents returning previously expired windows. -- Next, it iterates over window segments and identifies the windows that should - be marked as expired. -- Finally, it updates the expiration cache with the start time of the latest - windows found. +offset or `None` if there's no processed offset yet -**Arguments**: + -- `max_start_time`: The timestamp up to which windows are considered expired, inclusive. -- `prefix`: The key prefix for filtering windows. -- `delete`: If True, expired windows will be deleted. +#### StorePartition.get\_changelog\_offset + +```python +@abstractmethod +def get_changelog_offset() -> Optional[int] +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L65) + +Get offset that the changelog is up-to-date with. 
**Returns**: -A sorted list of tuples in the format `((start, end), value)`. +offset or `None` if there's no processed offset yet - + -#### WindowedRocksDBPartitionTransaction.delete\_windows +#### StorePartition.write ```python -def delete_windows(max_start_time: int, prefix: bytes) -> None +@abstractmethod +def write(cache: PartitionTransactionCache, processed_offset: Optional[int], + changelog_offset: Optional[int]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/transaction.py#L185) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L73) -Delete windows from RocksDB up to the specified `max_start_time` timestamp. - -This method removes all window entries that have a start time less than or equal to the given -`max_start_time`. It ensures that expired data is cleaned up efficiently without affecting -unexpired windows. - -How it works: -- It retrieves the start time of the last deleted window for the given prefix from the -deletion index. This minimizes redundant scans over already deleted windows. -- It iterates over the windows starting from the last deleted timestamp up to the `max_start_time`. -- Each window within this range is deleted from the database. -- After deletion, it updates the deletion index with the start time of the latest window -that was deleted to keep track of progress. +Update the state with data from the update cache **Arguments**: -- `max_start_time`: The timestamp up to which windows should be deleted, inclusive. -- `prefix`: The key prefix used to identify and filter relevant windows. +- `cache`: The modified data +- `processed_offset`: The offset processed to generate the data. +- `changelog_offset`: The changelog message offset of the data. - + -#### WindowedRocksDBPartitionTransaction.get\_windows +#### StorePartition.get ```python -def get_windows(start_from_ms: int, - start_to_ms: int, - prefix: bytes, - backwards: bool = False) -> list[tuple[tuple[int, int], Any]] +@abstractmethod +def get(key: bytes, + cf_name: str = "default") -> Union[bytes, Literal[Marker.UNDEFINED]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/transaction.py#L232) - -Get all windows that start between "start_from_ms" and "start_to_ms" - -within the specified prefix. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L88) -This function also checks the update cache for any updates not yet -committed to RocksDB. +Get a key from the store **Arguments**: -- `start_from_ms`: The minimal window start time, exclusive. -- `start_to_ms`: The maximum window start time, inclusive. -- `prefix`: The key prefix for filtering windows. -- `backwards`: If True, yields windows in reverse order. +- `key`: a key encoded to `bytes` +- `default`: a default value to return if the key is not found. +- `cf_name`: rocksdb column family name. Default - "default" **Returns**: -A sorted list of tuples in the format `((start, end), value)`. - - - -## quixstreams.state.rocksdb.windowed - - - -## quixstreams.state.rocksdb.windowed.serialization +a value if the key is present in the store. 
Otherwise, `default` - + -#### parse\_window\_key +#### StorePartition.exists ```python -def parse_window_key(key: bytes) -> Tuple[bytes, int, int] +@abstractmethod +def exists(key: bytes, cf_name: str = "default") -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/serialization.py#L21) - -Parse the window key from Rocksdb into (message_key, start, end) structure. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L101) -Expected window key format: -|| +Check if a key is present in the store. **Arguments**: -- `key`: a key from Rocksdb +- `key`: a key encoded to `bytes`. +- `cf_name`: rocksdb column family name. Default - "default" **Returns**: -a tuple with message key, start timestamp, end timestamp +`True` if the key is present, `False` otherwise. - + -#### encode\_window\_key +#### StorePartition.begin ```python -def encode_window_key(start_ms: int, end_ms: int) -> bytes +def begin() -> PartitionTransaction ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/serialization.py#L41) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L110) -Encode window start and end timestamps into bytes of the following format: +Start a new `PartitionTransaction` -```|``` +Using `PartitionTransaction` is a recommended way for accessing the data. -Encoding window keys this way make them sortable in RocksDB within the same prefix. + + +#### StorePartition.recover\_from\_changelog\_message + +```python +def recover_from_changelog_message( + changelog_message: ConfluentKafkaMessageProto, + committed_offset: int) -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/partition.py#L123) + +Updates state from a given changelog message. **Arguments**: -- `start_ms`: window start in milliseconds -- `end_ms`: window end in milliseconds +- `changelog_message`: A raw Confluent message read from a changelog topic. +- `committed_offset`: latest committed offset for the partition -**Returns**: + -window timestamps as bytes +## quixstreams.state.base.store - + -#### encode\_window\_prefix +### Store ```python -def encode_window_prefix(prefix: bytes, start_ms: int) -> bytes +class Store(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/serialization.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L15) -Encode window prefix and start time to iterate over keys in RocksDB - -Format: -```|``` +Abstract state store. -**Arguments**: +It keeps track of individual store partitions and provides access to the +partitions' transactions. 
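
The sketch below shows how the methods documented in the rest of this section
are typically driven; the concrete `store` instance (for example, a
RocksDB-backed store) and the partition number are assumptions made for the
example.

```python
partition_num = 0

# Assign the partition to this store, work with it through a transaction,
# then hand the partition back and close the store.
store.assign_partition(partition_num)
try:
    tx = store.start_partition_transaction(partition_num)
    # ... read and write state through the transaction here ...
finally:
    store.revoke_partition(partition_num)
    store.close()
```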
-- `prefix`: transaction prefix -- `start_ms`: window start time in milliseconds + -**Returns**: +#### Store.topic -bytes +```python +@property +def topic() -> Optional[str] +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L35) -## quixstreams.state.rocksdb.windowed.state +Topic name - + -### WindowedTransactionState +#### Store.name ```python -class WindowedTransactionState(WindowedState) +@property +def name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L42) - +Store name -#### WindowedTransactionState.\_\_init\_\_ + + +#### Store.partitions ```python -def __init__(transaction: "WindowedRocksDBPartitionTransaction", - prefix: bytes) +@property +def partitions() -> Dict[int, StorePartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L49) -A windowed state to be provided into `StreamingDataFrame` window functions. +Mapping of assigned store partitions -**Arguments**: +**Returns**: -- `transaction`: instance of `WindowedRocksDBPartitionTransaction` +dict of "{partition: }" - + -#### WindowedTransactionState.get\_window +#### Store.assign\_partition ```python -def get_window(start_ms: int, - end_ms: int, - default: Any = None) -> Optional[Any] +def assign_partition(partition: int) -> StorePartition ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L56) -Get the value of the window defined by `start` and `end` timestamps - -if the window is present in the state, else default +Assign new store partition **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `default`: default value to return if the key is not found +- `partition`: partition number **Returns**: -value or None if the key is not found and `default` is not provided +instance of `StorePartition` - + -#### WindowedTransactionState.update\_window +#### Store.revoke\_partition ```python -def update_window(start_ms: int, - end_ms: int, - value: Any, - timestamp_ms: int, - window_timestamp_ms: Optional[int] = None) -> None +def revoke_partition(partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L83) -Set a value for the window. - -This method will also update the latest observed timestamp in state partition -using the provided `timestamp`. 
+Revoke assigned store partition **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `value`: value of the window -- `timestamp_ms`: current message timestamp in milliseconds -- `window_timestamp_ms`: arbitrary timestamp stored with the window value +- `partition`: partition number - + -#### WindowedTransactionState.get\_latest\_timestamp +#### Store.start\_partition\_transaction ```python -def get_latest_timestamp() -> Optional[int] +def start_partition_transaction(partition: int) -> PartitionTransaction ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L69) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L101) -Get the latest observed timestamp for the current message key. +Start a new partition transaction. -Use this timestamp to determine if the arriving event is late and should be -discarded from the processing. +`PartitionTransaction` is the primary interface for working with data in Stores. + +**Arguments**: + +- `partition`: partition number **Returns**: -latest observed event timestamp in milliseconds +instance of `PartitionTransaction` - + -#### WindowedTransactionState.expire\_windows +#### Store.close ```python -def expire_windows(max_start_time: int, - delete: bool = True) -> list[tuple[tuple[int, int], Any]] +def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L81) - -Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. - -This method marks the latest found window as expired in the expiration index, -so consecutive calls may yield different results for the same "latest timestamp". +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/store.py#L119) -**Arguments**: - -- `max_start_time`: The timestamp up to which windows are considered expired, inclusive. -- `delete`: If True, expired windows will be deleted. +Close store and revoke all store partitions -**Returns**: + -A sorted list of tuples in the format `((start, end), value)`. +## quixstreams.state.base.transaction - + -#### WindowedTransactionState.get\_windows +### PartitionTransactionCache ```python -def get_windows(start_from_ms: int, - start_to_ms: int, - backwards: bool = False) -> list[tuple[tuple[int, int], Any]] +class PartitionTransactionCache() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L98) - -Get all windows that start between "start_from_ms" and "start_to_ms". - -**Arguments**: - -- `start_from_ms`: The minimal window start time, exclusive. -- `start_to_ms`: The maximum window start time, inclusive. -- `backwards`: If True, yields windows in reverse order. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L36) -**Returns**: +A cache with the data updated in the current PartitionTransaction. +It is used to read-your-own-writes before the transaction is committed to the Store. -A sorted list of tuples in the format `((start, end), value)`. +Internally, updates and deletes are separated into two separate structures +to simplify the querying over them. 
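
As a rough illustration of the read-your-own-writes behaviour described above, the sketch below exercises the documented `set`, `get`, `delete`, and `is_empty` methods. The no-argument constructor and the printed marker values are assumptions; only the method signatures come from this reference.

```python
from quixstreams.state.base.transaction import PartitionTransactionCache

# Assumed to be constructible without arguments for this illustration.
cache = PartitionTransactionCache()
prefix = b"message-key"

# A value set during the transaction is readable back before any commit.
cache.set(key=b"count", value=b"10", prefix=prefix)
assert cache.get(b"count", prefix) == b"10"

# Deleted keys resolve to the "DELETED" marker, unknown keys to "UNDEFINED"
# (meaning the underlying store still needs to be checked).
cache.delete(b"count", prefix=prefix)
print(cache.get(b"count", prefix))
print(cache.get(b"unknown", prefix))

print(cache.is_empty())  # False: the cache holds pending changes
```
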
- + -#### WindowedTransactionState.delete\_windows +#### PartitionTransactionCache.get ```python -def delete_windows(max_start_time: int) -> None +def get(key: bytes, + prefix: bytes, + cf_name: str = "default") -> Union[bytes, Marker] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/windowed/state.py#L116) - -Delete windows from RocksDB up to the specified `max_start_time` timestamp. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L59) -This method removes all window entries that have a start time less than or equal to the given -`max_start_time`. It ensures that expired data is cleaned up efficiently without affecting -unexpired windows. +Get a value for the key. -**Arguments**: +Returns the key value if it has been updated during the transaction. -- `max_start_time`: The timestamp up to which windows should be deleted, inclusive. +If the key has already been deleted, returns "DELETED" sentinel +(we don't need to check the actual store). +If the key is not present in the cache, returns "UNDEFINED sentinel +(we need to check the store). - +:param: key: key as bytes +:param: prefix: key prefix as bytes +:param: cf_name: column family name -## quixstreams.state.rocksdb.options - + -### RocksDBOptions +#### PartitionTransactionCache.set ```python -@dataclasses.dataclass(frozen=True) -class RocksDBOptions(RocksDBOptionsType) +def set(key: bytes, value: bytes, prefix: bytes, cf_name: str = "default") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/options.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L89) -RocksDB database options. +Set a value for the key. -**Arguments**: +:param: key: key as bytes +:param: value: value as bytes +:param: prefix: key prefix as bytes +:param: cf_name: column family name -- `dumps`: function to dump data to JSON -- `loads`: function to load data from JSON -- `open_max_retries`: number of times to retry opening the database -if it's locked by another process. To disable retrying, pass 0 -- `open_retry_backoff`: number of seconds to wait between each retry. -Please see `rocksdict.Options` for a complete description of other options. - + -#### RocksDBOptions.to\_options +#### PartitionTransactionCache.delete ```python -def to_options() -> rocksdict.Options +def delete(key: Any, prefix: bytes, cf_name: str = "default") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/options.py#L54) - -Convert parameters to `rocksdict.Options` - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L102) -instance of `rocksdict.Options` +Delete a key. - +:param: key: key as bytes +:param: value: value as bytes +:param: prefix: key prefix as bytes +:param: cf_name: column family name -## quixstreams.state.rocksdb.store - + -### RocksDBStore +#### PartitionTransactionCache.is\_empty ```python -class RocksDBStore(Store) +def is_empty() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/store.py#L18) - -RocksDB-based state store. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L115) -It keeps track of individual store partitions and provides access to the -partitions' transactions. 
+Return True if any changes have been made (updates or deletes), otherwise +return False. - + -#### RocksDBStore.\_\_init\_\_ +#### PartitionTransactionCache.get\_column\_families ```python -def __init__( - name: str, - topic: Optional[str], - base_dir: str, - changelog_producer_factory: Optional[ChangelogProducerFactory] = None, - options: Optional[RocksDBOptionsType] = None) +def get_column_families() -> Set[str] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/store.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L122) -**Arguments**: +Get all update column families. -- `name`: a unique store name -- `topic`: a topic name for this store -- `base_dir`: path to a directory with the state -- `changelog_producer_factory`: a ChangelogProducerFactory instance -if using changelogs -- `options`: RocksDB options. If `None`, the default options will be used. + - +#### PartitionTransactionCache.get\_updates -## quixstreams.state.rocksdb.partition +```python +def get_updates(cf_name: str = "default") -> Dict[bytes, Dict[bytes, bytes]] +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L128) -### RocksDBStorePartition +Get all updated keys (excluding deleted) -```python -class RocksDBStorePartition(StorePartition) -``` +in the format "{: {: }}". -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L30) +:param: cf_name: column family name -A base class to access state in RocksDB. -It represents a single RocksDB database. + -Responsibilities: - 1. Managing access to the RocksDB instance - 2. Creating transactions to interact with data - 3. Flushing WriteBatches to the RocksDB +#### PartitionTransactionCache.get\_deletes -It opens the RocksDB on `__init__`. If the db is locked by another process, -it will retry according to `open_max_retries` and `open_retry_backoff` options. +```python +def get_deletes(cf_name: str = "default") -> Set[bytes] +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L137) -- `path`: an absolute path to the RocksDB folder -- `options`: RocksDB options. If `None`, the default options will be used. +Get all deleted keys (excluding updated) as a set. - + -#### RocksDBStorePartition.write +### PartitionTransactionStatus ```python -def write(cache: PartitionTransactionCache, - processed_offset: Optional[int], - changelog_offset: Optional[int], - batch: Optional[WriteBatch] = None) +class PartitionTransactionStatus(enum.Enum) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L115) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L144) -Write data to RocksDB + -**Arguments**: +#### STARTED -- `cache`: The modified data -- `processed_offset`: The offset processed to generate the data. -- `changelog_offset`: The changelog message offset of the data. -- `batch`: prefilled `rocksdict.WriteBatch`, optional. 
+Transaction is started and accepts updates - + -#### RocksDBStorePartition.get +#### PREPARED -```python -def get(key: bytes, - cf_name: str = "default") -> Union[bytes, Literal[Marker.UNDEFINED]] -``` +Transaction is prepared, it can no longer receive updates -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L180) + -Get a key from RocksDB. +#### COMPLETE -**Arguments**: +Transaction is fully completed, it cannot be used anymore -- `key`: a key encoded to `bytes` -- `default`: a default value to return if the key is not found. -- `cf_name`: rocksdb column family name. Default - "default" + -**Returns**: +#### FAILED -a value if the key is present in the DB. Otherwise, `default` +Transaction is failed, it cannot be used anymore - + -#### RocksDBStorePartition.exists +#### validate\_transaction\_status ```python -def exists(key: bytes, cf_name: str = "default") -> bool +def validate_transaction_status(*allowed: PartitionTransactionStatus) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L196) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L155) -Check if a key is present in the DB. +Check that the status of `RocksDBTransaction` is valid before calling a method -**Arguments**: + -- `key`: a key encoded to `bytes`. -- `cf_name`: rocksdb column family name. Default - "default" +### PartitionTransaction -**Returns**: +```python +class PartitionTransaction(ABC) +``` -`True` if the key is present, `False` otherwise. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L175) - +A transaction class to perform simple key-value operations like +"get", "set", "delete" and "exists" on a single storage partition. -#### RocksDBStorePartition.get\_processed\_offset + + +#### PartitionTransaction.failed ```python -def get_processed_offset() -> Optional[int] +@property +def failed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L207) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L207) -Get last processed offset for the given partition +Return `True` if transaction failed to update data at some point. + +Failed transactions cannot be re-used. **Returns**: -offset or `None` if there's no processed offset yet +bool - + -#### RocksDBStorePartition.get\_changelog\_offset +#### PartitionTransaction.completed ```python -def get_changelog_offset() -> Optional[int] +@property +def completed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L222) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L217) -Get offset that the changelog is up-to-date with. +Return `True` if transaction is successfully completed. + +Completed transactions cannot be re-used. 
**Returns**: -offset or `None` if there's no processed offset yet +bool - + -#### RocksDBStorePartition.close +#### PartitionTransaction.prepared ```python -def close() +@property +def prepared() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L234) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L227) -Close the underlying RocksDB +Return `True` if transaction is prepared completed. - +Prepared transactions cannot receive new updates, but can be flushed. -#### RocksDBStorePartition.path +**Returns**: + +bool + + + +#### PartitionTransaction.changelog\_topic\_partition ```python @property -def path() -> str +def changelog_topic_partition() -> Optional[Tuple[str, int]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L247) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L237) -Absolute path to RocksDB database folder +Return the changelog topic-partition for the StorePartition of this transaction. + +Returns `None` if changelog_producer is not provided. **Returns**: -file path +(topic, partition) or None - + -#### RocksDBStorePartition.destroy +#### PartitionTransaction.as\_state ```python -@classmethod -def destroy(cls, path: str) +def as_state(prefix: Any = DEFAULT_PREFIX) -> State ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L255) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L264) -Delete underlying RocksDB database +Create an instance implementing the `State` protocol to be provided -The database must be closed first. +to `StreamingDataFrame` functions. +All operations called on this State object will be prefixed with +the supplied `prefix`. -**Arguments**: +**Returns**: -- `path`: an absolute path to the RocksDB folder +an instance implementing the `State` protocol - + -#### RocksDBStorePartition.get\_column\_family\_handle +#### PartitionTransaction.get ```python -def get_column_family_handle(cf_name: str) -> ColumnFamily +@validate_transaction_status(PartitionTransactionStatus.STARTED) +def get(key: Any, + prefix: bytes, + default: Any = None, + cf_name: str = "default") -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L265) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L283) -Get a column family handle to pass to it WriteBatch. +Get a key from the store. -This method will cache the CF handle instance to avoid creating them -repeatedly. +It returns `None` if the key is not found and `default` is not provided. 
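
For orientation, here is a hedged sketch of the key-level operations on a started transaction; `tx` is assumed to come from `Store.start_partition_transaction`, and the prefix is typically the message key.

```python
from quixstreams.state.base.transaction import PartitionTransaction  # path assumed from the heading above


def demo_key_ops(tx: PartitionTransaction) -> None:
    prefix = b"user-1"

    tx.set(key="total", value=100, prefix=prefix)
    print(tx.get("total", prefix, default=0))  # 100: read-your-own-writes
    print(tx.exists("total", prefix))          # True

    tx.delete("total", prefix)
    print(tx.get("total", prefix, default=0))  # 0: falls back to the default
```
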
**Arguments**: +- `key`: key +- `prefix`: a key prefix +- `default`: default value to return if the key is not found - `cf_name`: column family name **Returns**: -instance of `rocksdict.ColumnFamily` +value or None if the key is not found and `default` is not provided - + -#### RocksDBStorePartition.get\_column\_family +#### PartitionTransaction.set ```python -def get_column_family(cf_name: str) -> Rdict +@validate_transaction_status(PartitionTransactionStatus.STARTED) +def set(key: Any, value: Any, prefix: bytes, cf_name: str = "default") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/partition.py#L286) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L319) -Get a column family instance. - -This method will cache the CF instance to avoid creating them repeatedly. +Set value for the key. **Arguments**: +- `key`: key +- `prefix`: a key prefix +- `value`: value - `cf_name`: column family name -**Returns**: + -instance of `rocksdict.Rdict` for the given column family +#### PartitionTransaction.delete - +```python +@validate_transaction_status(PartitionTransactionStatus.STARTED) +def delete(key: Any, prefix: bytes, cf_name: str = "default") +``` -## quixstreams.state.rocksdb.metadata +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L342) - +Delete value for the key. -## quixstreams.state.rocksdb +This function always returns `None`, even if value is not found. - +**Arguments**: -## quixstreams.state.rocksdb.types +- `key`: key +- `prefix`: a key prefix +- `cf_name`: column family name - + -## quixstreams.state.rocksdb.exceptions +#### PartitionTransaction.exists - +```python +@validate_transaction_status(PartitionTransactionStatus.STARTED) +def exists(key: Any, prefix: bytes, cf_name: str = "default") -> bool +``` -## quixstreams.state.metadata +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L361) - +Check if the key exists in state. -## quixstreams.state.memory.store +**Arguments**: - +- `key`: key +- `prefix`: a key prefix +- `cf_name`: column family name -### MemoryStore +**Returns**: + +True if key exists, False otherwise + + + +#### PartitionTransaction.prepare ```python -class MemoryStore(Store) +@validate_transaction_status(PartitionTransactionStatus.STARTED) +def prepare(processed_offset: Optional[int]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/store.py#L14) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L381) -In-memory state store. +Produce changelog messages to the changelog topic for all changes accumulated -It keeps track of individual store partitions and provides access to the -partitions' transactions. +in this transaction and prepare transaction to flush its state to the state +store. -Requires a full state recovery for each partition on assignment. +After successful `prepare()`, the transaction status is changed to PREPARED, +and it cannot receive updates anymore. - +If changelog is disabled for this application, no updates will be produced +to the changelog topic. 
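
The typical lifecycle, sketched below under the assumption that `tx` is a started transaction and that the offsets are placeholders normally supplied by the checkpointing machinery, is: accumulate updates, `prepare()` them, then `flush()`.

```python
def commit_transaction(tx, processed_offset: int, changelog_offset: int) -> None:
    tx.set(key="count", value=1, prefix=b"user-1")

    # PREPARED: changelog messages are produced and no further updates are accepted.
    tx.prepare(processed_offset=processed_offset)

    # Persist the accumulated changes along with the offsets.
    tx.flush(processed_offset=processed_offset, changelog_offset=changelog_offset)
```
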
-#### MemoryStore.\_\_init\_\_ +**Arguments**: + +- `processed_offset`: the offset of the latest processed message + + + +#### PartitionTransaction.flush ```python -def __init__( - name: str, - topic: Optional[str], - changelog_producer_factory: Optional[ChangelogProducerFactory] = None -) -> None +@validate_transaction_status(PartitionTransactionStatus.STARTED, + PartitionTransactionStatus.PREPARED) +def flush(processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/store.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/transaction.py#L442) + +Flush the recent updates to the database. + +It writes the WriteBatch to RocksDB and marks itself as finished. + +If writing fails, the transaction is marked as failed and +cannot be used anymore. + +>***NOTE:*** If no keys have been modified during the transaction + (i.e. no "set" or "delete" have been called at least once), it will + not flush ANY data to the database including the offset to optimize + I/O. **Arguments**: -- `name`: a unique store name -- `topic`: a topic name for this store -- `changelog_producer_factory`: a ChangelogProducerFactory instance -if using changelogs topics. +- `processed_offset`: offset of the last processed message, optional. +- `changelog_offset`: offset of the last produced changelog message, +optional. + + + +## quixstreams.state.memory @@ -7078,7 +6880,7 @@ if using changelogs topics. class MemoryStorePartition(StorePartition) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/partition.py#L36) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/partition.py#L36) Class to access in-memory state. @@ -7097,7 +6899,7 @@ def write(cache: PartitionTransactionCache, processed_offset: Optional[int], changelog_offset: Optional[int]) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/partition.py#L68) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/partition.py#L68) Write data to the state @@ -7115,7 +6917,7 @@ Write data to the state def get_processed_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/partition.py#L125) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/partition.py#L125) Get last processed offset for the given partition @@ -7131,7 +6933,7 @@ offset or `None` if there's no processed offset yet def get_changelog_offset() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/partition.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/partition.py#L132) Get offset that the changelog is up-to-date with. @@ -7149,7 +6951,7 @@ def get(key: bytes, cf_name: str = "default") -> Union[bytes, Literal[Marker.UNDEFINED]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/partition.py#L140) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/partition.py#L140) Get a key from the store @@ -7172,7 +6974,7 @@ a value if the key is present in the store. 
Otherwise, `default` def exists(key: bytes, cf_name: str = "default") -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/memory/partition.py#L154) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/partition.py#L154) Check if a key is present in the store. @@ -7185,606 +6987,663 @@ Check if a key is present in the store. `True` if the key is present, `False` otherwise. - + -## quixstreams.state.memory +## quixstreams.state.memory.store - - -## quixstreams.state.recovery - - + -### RecoveryPartition +### MemoryStore ```python -class RecoveryPartition() +class MemoryStore(Store) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/store.py#L14) -A changelog topic partition mapped to a respective `StorePartition` with helper -methods to determine its current recovery status. +In-memory state store. -Since `StorePartition`s do recovery directly, it also handles recovery transactions. +It keeps track of individual store partitions and provides access to the +partitions' transactions. - +Requires a full state recovery for each partition on assignment. -#### RecoveryPartition.offset + + +#### MemoryStore.\_\_init\_\_ ```python -@property -def offset() -> int +def __init__( + name: str, + topic: Optional[str], + changelog_producer_factory: Optional[ChangelogProducerFactory] = None +) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L64) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/memory/store.py#L24) -Get the changelog offset from the underlying `StorePartition`. +**Arguments**: -**Returns**: +- `name`: a unique store name +- `topic`: a topic name for this store +- `changelog_producer_factory`: a ChangelogProducerFactory instance +if using changelogs topics. -changelog offset (int) + - +## quixstreams.state -#### RecoveryPartition.needs\_recovery\_check + + +## quixstreams.state.exceptions + + + +## quixstreams.state.manager + + + +### StateStoreManager ```python -@property -def needs_recovery_check() -> bool +class StateStoreManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L30) -Determine whether to attempt recovery for underlying `StorePartition`. +Class for managing state stores and partitions. -This does NOT mean that anything actually requires recovering. +StateStoreManager is responsible for: + - reacting to rebalance callbacks + - managing the individual state stores + - providing access to store transactions - + -#### RecoveryPartition.has\_invalid\_offset +#### StateStoreManager.stores ```python @property -def has_invalid_offset() -> bool +def stores() -> Dict[Optional[str], Dict[str, Store]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L91) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L79) -Determine if the current changelog offset stored in state is invalid. 
+Map of registered state stores - +**Returns**: -#### RecoveryPartition.recover\_from\_changelog\_message +dict in format {topic: {store_name: store}} -```python -def recover_from_changelog_message( - changelog_message: ConfluentKafkaMessageProto) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L108) +#### StateStoreManager.recovery\_required -Recover the StorePartition using a message read from its respective changelog. +```python +@property +def recovery_required() -> bool +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L87) -- `changelog_message`: A confluent kafka message (everything as bytes) +Whether recovery needs to be done. - + -#### RecoveryPartition.set\_recovery\_consume\_position +#### StateStoreManager.using\_changelogs ```python -def set_recovery_consume_position(offset: int) +@property +def using_changelogs() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L120) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L96) -Update the recovery partition with the consumer's position (whenever +Whether the StateStoreManager is using changelog topics -an empty poll is returned during recovery). +**Returns**: -It is possible that it may be set more than once. +using changelogs, as bool -**Arguments**: + -- `offset`: the consumer's current read position of the changelog +#### StateStoreManager.do\_recovery - +```python +def do_recovery() -> None +``` -### ChangelogProducerFactory +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L108) + +Perform a state recovery, if necessary. + + + +#### StateStoreManager.stop\_recovery ```python -class ChangelogProducerFactory() +def stop_recovery() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L132) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L117) -Generates ChangelogProducers, which produce changelog messages to a StorePartition. +Stop recovery (called during app shutdown). 
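
To show how these pieces are usually wired together, here is a hedged sketch of the manager's flow around a rebalance; the `manager` instance is assumed to be already constructed, the committed offset is a placeholder, and only methods documented in this section are called.

```python
from quixstreams.state.manager import StateStoreManager  # path taken from the heading above


def handle_assignment(manager: StateStoreManager, topic: str, partition: int) -> None:
    # Register a store for the topic before processing starts (default store name).
    manager.register_store(topic_name=topic)

    # On a rebalance callback, assign the partition for every registered store.
    manager.on_partition_assign(topic=topic, partition=partition, committed_offset=-1001)

    # Look up the store and start a transaction for the assigned partition.
    store = manager.get_store(topic=topic)
    tx = store.start_partition_transaction(partition)
    tx.set(key="seen", value=True, prefix=b"message-key")
    tx.flush()

    # On revocation, drop the partition again.
    manager.on_partition_revoke(topic=topic, partition=partition)
```
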
- + -#### ChangelogProducerFactory.\_\_init\_\_ +#### StateStoreManager.get\_store ```python -def __init__(changelog_name: str, producer: RowProducer) +def get_store(topic: str, store_name: str = DEFAULT_STATE_STORE_NAME) -> Store ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L137) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L126) + +Get a store for given name and topic **Arguments**: -- `changelog_name`: changelog topic name -- `producer`: a RowProducer (not shared with `Application` instance) +- `topic`: topic name +- `store_name`: store name **Returns**: -a ChangelogWriter instance +instance of `Store` - + -#### ChangelogProducerFactory.get\_partition\_producer +#### StateStoreManager.register\_store ```python -def get_partition_producer(partition_num) -> "ChangelogProducer" +def register_store(topic_name: Optional[str], + store_name: str = DEFAULT_STATE_STORE_NAME, + store_type: Optional[StoreTypes] = None, + topic_config: Optional[TopicConfig] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L147) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L163) -Generate a ChangelogProducer for producing to a specific partition number +Register a state store to be managed by StateStoreManager. -(and thus StorePartition). +During processing, the StateStoreManager will react to rebalancing callbacks +and assign/revoke the partitions for registered stores. + +Each store can be registered only once for each topic. **Arguments**: -- `partition_num`: source topic partition number +- `topic_name`: topic name +- `store_name`: store name +- `store_type`: the storage type used for this store. +Default to StateStoreManager `default_store_type` - + -### ChangelogProducer +#### StateStoreManager.register\_windowed\_store ```python -class ChangelogProducer() +def register_windowed_store(topic_name: str, store_name: str) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L161) - -Generated for a `StorePartition` to produce state changes to its respective -kafka changelog partition. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L208) -#### ChangelogProducer.\_\_init\_\_ +Register a windowed state store to be managed by StateStoreManager. -```python -def __init__(changelog_name: str, partition: int, producer: RowProducer) -``` +During processing, the StateStoreManager will react to rebalancing callbacks +and assign/revoke the partitions for registered stores. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L167) +Each window store can be registered only once for each topic. **Arguments**: -- `changelog_name`: A changelog topic name -- `partition`: source topic partition number -- `producer`: a RowProducer (not shared with `Application` instance) +- `topic_name`: topic name +- `store_name`: store name - + -#### ChangelogProducer.produce +#### StateStoreManager.clear\_stores ```python -def produce(key: bytes, - value: Optional[bytes] = None, - headers: Optional[Headers] = None) +def clear_stores() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L190) - -Produce a message to a changelog topic partition. 
- -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L234) -- `key`: message key (same as state key, including prefixes) -- `value`: message value (same as state value) -- `headers`: message headers (includes column family info) +Delete all state stores managed by StateStoreManager. - + -### RecoveryManager +#### StateStoreManager.on\_partition\_assign ```python -class RecoveryManager() +def on_partition_assign(topic: Optional[str], partition: int, + committed_offset: int) -> Dict[str, StorePartition] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L215) - -Manages all consumer-related aspects of recovery, including: - - assigning/revoking, pausing/resuming topic partitions (especially changelogs) - - consuming changelog messages until state is updated fully. - -Also tracks/manages `RecoveryPartitions`, which are assigned/tracked only if -recovery for that changelog partition is required. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L250) -Recovery is attempted from the `Application` after any new partition assignment. +Assign store partitions for each registered store for the given `TopicPartition` - +and return a list of assigned `StorePartition` objects. -#### RecoveryManager.partitions +**Arguments**: -```python -@property -def partitions() -> Dict[int, Dict[str, RecoveryPartition]] -``` +- `topic`: Kafka topic name +- `partition`: Kafka topic partition +- `committed_offset`: latest committed offset for the partition -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L234) +**Returns**: -Returns a mapping of assigned RecoveryPartitions in the following format: -{: {: }} +list of assigned `StorePartition` - + -#### RecoveryManager.has\_assignments +#### StateStoreManager.on\_partition\_revoke ```python -@property -def has_assignments() -> bool +def on_partition_revoke(topic: str, partition: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L242) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L276) -Whether the Application has assigned RecoveryPartitions +Revoke store partitions for each registered store for the given `TopicPartition` -**Returns**: +**Arguments**: -has assignments, as bool +- `topic`: Kafka topic name +- `partition`: Kafka topic partition - + -#### RecoveryManager.recovering +#### StateStoreManager.init ```python -@property -def recovering() -> bool +def init() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L251) - -Whether the Application is currently recovering +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L289) -**Returns**: +Initialize `StateStoreManager` and create a store directory -is recovering, as bool - + -#### RecoveryManager.register\_changelog +#### StateStoreManager.close ```python -def register_changelog(topic_name: Optional[str], - store_name: str, - topic_config: Optional[TopicConfig] = None) -> Topic +def close() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L259) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/manager.py#L296) -Register a changelog 
Topic with the TopicManager. +Close all registered stores -**Arguments**: + -- `topic_name`: source topic name -- `store_name`: name of the store +## quixstreams.state.metadata - + -#### RecoveryManager.do\_recovery +## quixstreams.state.recovery + + + +### RecoveryPartition ```python -def do_recovery() +class RecoveryPartition() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L277) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L26) -If there are any active RecoveryPartitions, do a recovery procedure. +A changelog topic partition mapped to a respective `StorePartition` with helper +methods to determine its current recovery status. -After, will resume normal `Application` processing. +Since `StorePartition`s do recovery directly, it also handles recovery transactions. - + -#### RecoveryManager.assign\_partition +#### RecoveryPartition.offset ```python -def assign_partition(topic: Optional[str], partition: int, - committed_offset: int, - store_partitions: Dict[str, StorePartition]) +@property +def offset() -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L332) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L64) -Assigns `StorePartition`s (as `RecoveryPartition`s) ONLY IF recovery required. +Get the changelog offset from the underlying `StorePartition`. -Pauses active consumer partitions as needed. +**Returns**: - +changelog offset (int) -#### RecoveryManager.revoke\_partition + + +#### RecoveryPartition.needs\_recovery\_check ```python -def revoke_partition(partition_num: int) +@property +def needs_recovery_check() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/recovery.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L80) -revoke ALL StorePartitions (across all Stores) for a given partition number +Determine whether to attempt recovery for underlying `StorePartition`. -**Arguments**: +This does NOT mean that anything actually requires recovering. -- `partition_num`: partition number of source topic + - +#### RecoveryPartition.has\_invalid\_offset -## quixstreams.state +```python +@property +def has_invalid_offset() -> bool +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L91) -## quixstreams.state.types +Determine if the current changelog offset stored in state is invalid. - + -### WindowedState +#### RecoveryPartition.recover\_from\_changelog\_message ```python -class WindowedState(Protocol) +def recover_from_changelog_message( + changelog_message: ConfluentKafkaMessageProto) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L7) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L108) -A windowed state to be provided into `StreamingDataFrame` window functions. +Recover the StorePartition using a message read from its respective changelog. 
- +**Arguments**: -#### WindowedState.get\_window +- `changelog_message`: A confluent kafka message (everything as bytes) + + + +#### RecoveryPartition.set\_recovery\_consume\_position ```python -def get_window(start_ms: int, - end_ms: int, - default: Any = None) -> Optional[Any] +def set_recovery_consume_position(offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L120) -Get the value of the window defined by `start` and `end` timestamps +Update the recovery partition with the consumer's position (whenever -if the window is present in the state, else default +an empty poll is returned during recovery). + +It is possible that it may be set more than once. **Arguments**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `default`: default value to return if the key is not found +- `offset`: the consumer's current read position of the changelog -**Returns**: + -value or None if the key is not found and `default` is not provided +### ChangelogProducerFactory - +```python +class ChangelogProducerFactory() +``` -#### WindowedState.update\_window +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L132) + +Generates ChangelogProducers, which produce changelog messages to a StorePartition. + + + +#### ChangelogProducerFactory.\_\_init\_\_ ```python -def update_window(start_ms: int, - end_ms: int, - value: Any, - timestamp_ms: int, - window_timestamp_ms: Optional[int] = None) +def __init__(changelog_name: str, producer: RowProducer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L137) -Set a value for the window. +**Arguments**: -This method will also update the latest observed timestamp in state partition -using the provided `timestamp`. +- `changelog_name`: changelog topic name +- `producer`: a RowProducer (not shared with `Application` instance) -**Arguments**: +**Returns**: -- `start_ms`: start of the window in milliseconds -- `end_ms`: end of the window in milliseconds -- `value`: value of the window -- `timestamp_ms`: current message timestamp in milliseconds -- `window_timestamp_ms`: arbitrary timestamp stored with the window value +a ChangelogWriter instance - + -#### WindowedState.get\_latest\_timestamp +#### ChangelogProducerFactory.get\_partition\_producer ```python -def get_latest_timestamp() -> Optional[int] +def get_partition_producer(partition_num) -> "ChangelogProducer" ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L147) -Get the latest observed timestamp for the current state partition. +Generate a ChangelogProducer for producing to a specific partition number -Use this timestamp to determine if the arriving event is late and should be -discarded from the processing. +(and thus StorePartition). 
-**Returns**: +**Arguments**: -latest observed event timestamp in milliseconds +- `partition_num`: source topic partition number - + -#### WindowedState.expire\_windows +### ChangelogProducer ```python -def expire_windows(max_start_time: int, - delete: bool = True) -> list[tuple[tuple[int, int], Any]] +class ChangelogProducer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L59) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L161) -Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. +Generated for a `StorePartition` to produce state changes to its respective +kafka changelog partition. -This method marks the latest found window as expired in the expiration index, -so consecutive calls may yield different results for the same "latest timestamp". + -**Arguments**: +#### ChangelogProducer.\_\_init\_\_ -- `max_start_time`: The timestamp up to which windows are considered expired, inclusive. -- `delete`: If True, expired windows will be deleted. +```python +def __init__(changelog_name: str, partition: int, producer: RowProducer) +``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L167) -A sorted list of tuples in the format `((start, end), value)`. +**Arguments**: - +- `changelog_name`: A changelog topic name +- `partition`: source topic partition number +- `producer`: a RowProducer (not shared with `Application` instance) -#### WindowedState.delete\_windows + + +#### ChangelogProducer.produce ```python -def delete_windows(max_start_time: int) -> None +def produce(key: bytes, + value: Optional[bytes] = None, + headers: Optional[Headers] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L74) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L190) -Delete windows from RocksDB up to the specified `max_start_time` timestamp. - -This method removes all window entries that have a start time less than or equal to the given -`max_start_time`. It ensures that expired data is cleaned up efficiently without affecting -unexpired windows. +Produce a message to a changelog topic partition. **Arguments**: -- `max_start_time`: The timestamp up to which windows should be deleted, inclusive. +- `key`: message key (same as state key, including prefixes) +- `value`: message value (same as state value) +- `headers`: message headers (includes column family info) - + -#### WindowedState.get\_windows +### RecoveryManager ```python -def get_windows(start_from_ms: int, - start_to_ms: int, - backwards: bool = False) -> list[tuple[tuple[int, int], Any]] +class RecoveryManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L86) - -Get all windows that start between "start_from_ms" and "start_to_ms". - -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L215) -- `start_from_ms`: The minimal window start time, exclusive. -- `start_to_ms`: The maximum window start time, inclusive. -- `backwards`: If True, yields windows in reverse order. +Manages all consumer-related aspects of recovery, including: + - assigning/revoking, pausing/resuming topic partitions (especially changelogs) + - consuming changelog messages until state is updated fully. 
-**Returns**: +Also tracks/manages `RecoveryPartitions`, which are assigned/tracked only if +recovery for that changelog partition is required. -A sorted list of tuples in the format `((start, end), value)`. +Recovery is attempted from the `Application` after any new partition assignment. - + -### WindowedPartitionTransaction +#### RecoveryManager.partitions ```python -class WindowedPartitionTransaction(Protocol) +@property +def partitions() -> Dict[int, Dict[str, RecoveryPartition]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L234) - +Returns a mapping of assigned RecoveryPartitions in the following format: +{: {: }} -#### WindowedPartitionTransaction.failed + + +#### RecoveryManager.has\_assignments ```python @property -def failed() -> bool +def has_assignments() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L102) - -Return `True` if transaction failed to update data at some point. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L242) -Failed transactions cannot be re-used. +Whether the Application has assigned RecoveryPartitions **Returns**: -bool +has assignments, as bool - + -#### WindowedPartitionTransaction.completed +#### RecoveryManager.recovering ```python @property -def completed() -> bool +def recovering() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L112) - -Return `True` if transaction is successfully completed. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L251) -Completed transactions cannot be re-used. +Whether the Application is currently recovering **Returns**: -bool +is recovering, as bool - + -#### WindowedPartitionTransaction.prepared +#### RecoveryManager.register\_changelog ```python -@property -def prepared() -> bool +def register_changelog(topic_name: Optional[str], + store_name: str, + topic_config: Optional[TopicConfig] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L122) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L259) -Return `True` if transaction is prepared completed. +Register a changelog Topic with the TopicManager. -Prepared transactions cannot receive new updates, but can be flushed. +**Arguments**: -**Returns**: +- `topic_name`: source topic name +- `store_name`: name of the store -bool + - +#### RecoveryManager.do\_recovery -#### WindowedPartitionTransaction.prepare +```python +def do_recovery() +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L277) + +If there are any active RecoveryPartitions, do a recovery procedure. + +After, will resume normal `Application` processing. 
+ + + +#### RecoveryManager.assign\_partition ```python -def prepare(processed_offset: Optional[int]) +def assign_partition(topic: Optional[str], partition: int, + committed_offset: int, + store_partitions: Dict[str, StorePartition]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L131) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L332) -Produce changelog messages to the changelog topic for all changes accumulated +Assigns `StorePartition`s (as `RecoveryPartition`s) ONLY IF recovery required. -in this transaction and prepare transcation to flush its state to the state -store. +Pauses active consumer partitions as needed. -After successful `prepare()`, the transaction status is changed to PREPARED, -and it cannot receive updates anymore. + -If changelog is disabled for this application, no updates will be produced -to the changelog topic. +#### RecoveryManager.revoke\_partition + +```python +def revoke_partition(partition_num: int) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/recovery.py#L409) + +revoke ALL StorePartitions (across all Stores) for a given partition number **Arguments**: -- `processed_offset`: the offset of the latest processed message +- `partition_num`: partition number of source topic - + -#### WindowedPartitionTransaction.get\_window +## quixstreams.state.serialization + + + +## quixstreams.state.types + + + +### WindowedState + +```python +class WindowedState(Protocol) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L7) + +A windowed state to be provided into `StreamingDataFrame` window functions. + + + +#### WindowedState.get\_window ```python def get_window(start_ms: int, end_ms: int, - prefix: bytes, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L148) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L12) Get the value of the window defined by `start` and `end` timestamps @@ -7794,23 +7653,25 @@ if the window is present in the state, else default - `start_ms`: start of the window in milliseconds - `end_ms`: end of the window in milliseconds -- `prefix`: a key prefix - `default`: default value to return if the key is not found **Returns**: value or None if the key is not found and `default` is not provided - + -#### WindowedPartitionTransaction.update\_window +#### WindowedState.update\_window ```python -def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int, - prefix: bytes) +def update_window(start_ms: int, + end_ms: int, + value: Any, + timestamp_ms: int, + window_timestamp_ms: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L167) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L26) Set a value for the window. @@ -7823,21 +7684,19 @@ using the provided `timestamp`. 
- `end_ms`: end of the window in milliseconds - `value`: value of the window - `timestamp_ms`: current message timestamp in milliseconds -- `prefix`: a key prefix +- `window_timestamp_ms`: arbitrary timestamp stored with the window value - + -#### WindowedPartitionTransaction.get\_latest\_timestamp +#### WindowedState.get\_latest\_timestamp ```python -def get_latest_timestamp(prefix: bytes) -> int +def get_latest_timestamp() -> Optional[int] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L184) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L48) -Get the latest observed timestamp for the current state prefix - -(same as message key). +Get the latest observed timestamp for the current state partition. Use this timestamp to determine if the arriving event is late and should be discarded from the processing. @@ -7846,17 +7705,16 @@ discarded from the processing. latest observed event timestamp in milliseconds - + -#### WindowedPartitionTransaction.expire\_windows +#### WindowedState.expire\_windows ```python def expire_windows(max_start_time: int, - prefix: bytes, delete: bool = True) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L196) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L59) Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. @@ -7866,22 +7724,21 @@ so consecutive calls may yield different results for the same "latest timestamp" **Arguments**: - `max_start_time`: The timestamp up to which windows are considered expired, inclusive. -- `prefix`: The key prefix for filtering windows. - `delete`: If True, expired windows will be deleted. **Returns**: A sorted list of tuples in the format `((start, end), value)`. - + -#### WindowedPartitionTransaction.delete\_windows +#### WindowedState.delete\_windows ```python -def delete_windows(max_start_time: int, prefix: bytes) -> None +def delete_windows(max_start_time: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L212) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L74) Delete windows from RocksDB up to the specified `max_start_time` timestamp. @@ -7892,2671 +7749,2928 @@ unexpired windows. **Arguments**: - `max_start_time`: The timestamp up to which windows should be deleted, inclusive. -- `prefix`: The key prefix used to identify and filter relevant windows. - + -#### WindowedPartitionTransaction.get\_windows +#### WindowedState.get\_windows ```python def get_windows(start_from_ms: int, start_to_ms: int, - prefix: bytes, backwards: bool = False) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L225) - -Get all windows that start between "start_from_ms" and "start_to_ms" +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L86) -within the specified prefix. +Get all windows that start between "start_from_ms" and "start_to_ms". **Arguments**: - `start_from_ms`: The minimal window start time, exclusive. - `start_to_ms`: The maximum window start time, inclusive. -- `prefix`: The key prefix for filtering windows. - `backwards`: If True, yields windows in reverse order. 
**Returns**: A sorted list of tuples in the format `((start, end), value)`. - + -#### WindowedPartitionTransaction.flush +### WindowedPartitionTransaction ```python -def flush(processed_offset: Optional[int] = None, - changelog_offset: Optional[int] = None) +class WindowedPartitionTransaction(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L244) - -Flush the recent updates to the storage. - -**Arguments**: - -- `processed_offset`: offset of the last processed message, optional. -- `changelog_offset`: offset of the last produced changelog message, -optional. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L100) - + -#### WindowedPartitionTransaction.changelog\_topic\_partition +#### WindowedPartitionTransaction.failed ```python @property -def changelog_topic_partition() -> Optional[Tuple[str, int]] +def failed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L258) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L102) -Return the changelog topic-partition for the StorePartition of this transaction. +Return `True` if transaction failed to update data at some point. -Returns `None` if changelog_producer is not provided. +Failed transactions cannot be re-used. **Returns**: -(topic, partition) or None +bool - + -### PartitionRecoveryTransaction +#### WindowedPartitionTransaction.completed ```python -class PartitionRecoveryTransaction(Protocol) +@property +def completed() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L272) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L112) -A class for managing recovery for a StorePartition from a changelog message +Return `True` if transaction is successfully completed. - +Completed transactions cannot be re-used. -#### PartitionRecoveryTransaction.flush +**Returns**: + +bool + + + +#### WindowedPartitionTransaction.prepared ```python -def flush() +@property +def prepared() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/types.py#L279) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L122) -Flush the recovery update to the storage. - - +Return `True` if transaction is prepared completed. -## quixstreams.state.exceptions +Prepared transactions cannot receive new updates, but can be flushed. - +**Returns**: -## quixstreams.state.manager +bool - + -### StateStoreManager +#### WindowedPartitionTransaction.prepare ```python -class StateStoreManager() +def prepare(processed_offset: Optional[int]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L30) - -Class for managing state stores and partitions. - -StateStoreManager is responsible for: - - reacting to rebalance callbacks - - managing the individual state stores - - providing access to store transactions - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L131) -#### StateStoreManager.stores +Produce changelog messages to the changelog topic for all changes accumulated -```python -@property -def stores() -> Dict[Optional[str], Dict[str, Store]] -``` +in this transaction and prepare transcation to flush its state to the state +store. 
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L79) +After successful `prepare()`, the transaction status is changed to PREPARED, +and it cannot receive updates anymore. -Map of registered state stores +If changelog is disabled for this application, no updates will be produced +to the changelog topic. -**Returns**: +**Arguments**: -dict in format {topic: {store_name: store}} +- `processed_offset`: the offset of the latest processed message - + -#### StateStoreManager.recovery\_required +#### WindowedPartitionTransaction.get\_window ```python -@property -def recovery_required() -> bool +def get_window(start_ms: int, + end_ms: int, + prefix: bytes, + default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L87) - -Whether recovery needs to be done. - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L148) -#### StateStoreManager.using\_changelogs +Get the value of the window defined by `start` and `end` timestamps -```python -@property -def using_changelogs() -> bool -``` +if the window is present in the state, else default -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L96) +**Arguments**: -Whether the StateStoreManager is using changelog topics +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `prefix`: a key prefix +- `default`: default value to return if the key is not found **Returns**: -using changelogs, as bool +value or None if the key is not found and `default` is not provided - + -#### StateStoreManager.do\_recovery +#### WindowedPartitionTransaction.update\_window ```python -def do_recovery() -> None +def update_window(start_ms: int, end_ms: int, value: Any, timestamp_ms: int, + prefix: bytes) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L108) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L167) -Perform a state recovery, if necessary. +Set a value for the window. - +This method will also update the latest observed timestamp in state partition +using the provided `timestamp`. -#### StateStoreManager.stop\_recovery +**Arguments**: -```python -def stop_recovery() -> None -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L117) - -Stop recovery (called during app shutdown). +- `start_ms`: start of the window in milliseconds +- `end_ms`: end of the window in milliseconds +- `value`: value of the window +- `timestamp_ms`: current message timestamp in milliseconds +- `prefix`: a key prefix - + -#### StateStoreManager.get\_store +#### WindowedPartitionTransaction.get\_latest\_timestamp ```python -def get_store(topic: str, store_name: str = DEFAULT_STATE_STORE_NAME) -> Store +def get_latest_timestamp(prefix: bytes) -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L126) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L184) -Get a store for given name and topic +Get the latest observed timestamp for the current state prefix -**Arguments**: +(same as message key). -- `topic`: topic name -- `store_name`: store name +Use this timestamp to determine if the arriving event is late and should be +discarded from the processing. 
**Returns**: -instance of `Store` +latest observed event timestamp in milliseconds - + -#### StateStoreManager.register\_store +#### WindowedPartitionTransaction.expire\_windows ```python -def register_store(topic_name: Optional[str], - store_name: str = DEFAULT_STATE_STORE_NAME, - store_type: Optional[StoreTypes] = None, - topic_config: Optional[TopicConfig] = None) -> None +def expire_windows(max_start_time: int, + prefix: bytes, + delete: bool = True) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L163) - -Register a state store to be managed by StateStoreManager. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L196) -During processing, the StateStoreManager will react to rebalancing callbacks -and assign/revoke the partitions for registered stores. +Get all expired windows from RocksDB up to the specified `max_start_time` timestamp. -Each store can be registered only once for each topic. +This method marks the latest found window as expired in the expiration index, +so consecutive calls may yield different results for the same "latest timestamp". **Arguments**: -- `topic_name`: topic name -- `store_name`: store name -- `store_type`: the storage type used for this store. -Default to StateStoreManager `default_store_type` +- `max_start_time`: The timestamp up to which windows are considered expired, inclusive. +- `prefix`: The key prefix for filtering windows. +- `delete`: If True, expired windows will be deleted. - +**Returns**: -#### StateStoreManager.register\_windowed\_store +A sorted list of tuples in the format `((start, end), value)`. + + + +#### WindowedPartitionTransaction.delete\_windows ```python -def register_windowed_store(topic_name: str, store_name: str) -> None +def delete_windows(max_start_time: int, prefix: bytes) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L208) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L212) -Register a windowed state store to be managed by StateStoreManager. - -During processing, the StateStoreManager will react to rebalancing callbacks -and assign/revoke the partitions for registered stores. +Delete windows from RocksDB up to the specified `max_start_time` timestamp. -Each window store can be registered only once for each topic. +This method removes all window entries that have a start time less than or equal to the given +`max_start_time`. It ensures that expired data is cleaned up efficiently without affecting +unexpired windows. **Arguments**: -- `topic_name`: topic name -- `store_name`: store name - - - -#### StateStoreManager.clear\_stores - -```python -def clear_stores() -> None -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L234) - -Delete all state stores managed by StateStoreManager. +- `max_start_time`: The timestamp up to which windows should be deleted, inclusive. +- `prefix`: The key prefix used to identify and filter relevant windows. 
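For orientation, the following sketch shows how a windowed aggregation might drive this transaction API. It is illustrative only and assumes a `WindowedPartitionTransaction` instance (`tx`), a message-key-derived `prefix`, and tumbling-window parameters that are not part of this reference; it relies solely on the `get_window`, `update_window`, `get_latest_timestamp`, and `expire_windows` signatures documented in this section.

```python
# Hypothetical sketch: "tx" is assumed to be a WindowedPartitionTransaction and
# "prefix" the key prefix (derived from the message key) used by all calls below.

def add_to_tumbling_window(tx, prefix: bytes, timestamp_ms: int, value: float,
                           window_size_ms: int = 10_000, grace_ms: int = 1_000):
    # Derive the tumbling window that this timestamp falls into.
    start_ms = timestamp_ms - (timestamp_ms % window_size_ms)
    end_ms = start_ms + window_size_ms

    # Read the current aggregate (0.0 if the window is not in state yet) and
    # store the new value; update_window() also advances the latest timestamp.
    current = tx.get_window(start_ms, end_ms, prefix=prefix, default=0.0)
    tx.update_window(start_ms, end_ms, value=current + value,
                     timestamp_ms=timestamp_ms, prefix=prefix)

    # Windows that started before "latest timestamp - size - grace" are closed:
    # emit them downstream and delete them from RocksDB.
    max_expired_start = tx.get_latest_timestamp(prefix=prefix) - window_size_ms - grace_ms
    return tx.expire_windows(max_start_time=max_expired_start, prefix=prefix, delete=True)
```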
- + -#### StateStoreManager.on\_partition\_assign +#### WindowedPartitionTransaction.get\_windows ```python -def on_partition_assign(topic: Optional[str], partition: int, - committed_offset: int) -> Dict[str, StorePartition] +def get_windows(start_from_ms: int, + start_to_ms: int, + prefix: bytes, + backwards: bool = False) -> list[tuple[tuple[int, int], Any]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L250) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L225) -Assign store partitions for each registered store for the given `TopicPartition` +Get all windows that start between "start_from_ms" and "start_to_ms" -and return a list of assigned `StorePartition` objects. +within the specified prefix. **Arguments**: -- `topic`: Kafka topic name -- `partition`: Kafka topic partition -- `committed_offset`: latest committed offset for the partition +- `start_from_ms`: The minimal window start time, exclusive. +- `start_to_ms`: The maximum window start time, inclusive. +- `prefix`: The key prefix for filtering windows. +- `backwards`: If True, yields windows in reverse order. **Returns**: -list of assigned `StorePartition` +A sorted list of tuples in the format `((start, end), value)`. - + -#### StateStoreManager.on\_partition\_revoke +#### WindowedPartitionTransaction.flush ```python -def on_partition_revoke(topic: str, partition: int) -> None +def flush(processed_offset: Optional[int] = None, + changelog_offset: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L276) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L244) -Revoke store partitions for each registered store for the given `TopicPartition` +Flush the recent updates to the storage. **Arguments**: -- `topic`: Kafka topic name -- `partition`: Kafka topic partition +- `processed_offset`: offset of the last processed message, optional. +- `changelog_offset`: offset of the last produced changelog message, +optional. - + -#### StateStoreManager.init +#### WindowedPartitionTransaction.changelog\_topic\_partition ```python -def init() -> None +@property +def changelog_topic_partition() -> Optional[Tuple[str, int]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L289) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L258) -Initialize `StateStoreManager` and create a store directory +Return the changelog topic-partition for the StorePartition of this transaction. +Returns `None` if changelog_producer is not provided. 
- +**Returns**: -#### StateStoreManager.close +(topic, partition) or None + + + +### PartitionRecoveryTransaction ```python -def close() -> None +class PartitionRecoveryTransaction(Protocol) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/manager.py#L296) - -Close all registered stores +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L272) - +A class for managing recovery for a StorePartition from a changelog message -## quixstreams.state.serialization + - +#### PartitionRecoveryTransaction.flush -## quixstreams.state.base.store +```python +def flush() +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/types.py#L279) -### Store +Flush the recovery update to the storage. -```python -class Store(ABC) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L15) +## quixstreams.utils -Abstract state store. + -It keeps track of individual store partitions and provides access to the -partitions' transactions. +## quixstreams.utils.json - + -#### Store.topic +#### dumps ```python -@property -def topic() -> Optional[str] +def dumps(value: Any) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L35) - -Topic name +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/utils/json.py#L8) - +Serialize to JSON using `orjson` package. -#### Store.name +**Arguments**: -```python -@property -def name() -> str -``` +- `value`: value to serialize to JSON -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L42) +**Returns**: -Store name +bytes - + -#### Store.partitions +#### loads ```python -@property -def partitions() -> Dict[int, StorePartition] +def loads(value: bytes) -> Any ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L49) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/utils/json.py#L18) -Mapping of assigned store partitions +Deserialize from JSON using `orjson` package. + +Main differences: +- It returns `bytes` +- It doesn't allow non-str keys in dictionaries + +**Arguments**: + +- `value`: value to deserialize from **Returns**: -dict of "{partition: }" +object - + -#### Store.assign\_partition +## quixstreams.utils.dicts + + + +#### dict\_values ```python -def assign_partition(partition: int) -> StorePartition +def dict_values(d: object) -> List ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L56) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/utils/dicts.py#L4) -Assign new store partition +Recursively unpacks a set of nested dicts to get a flattened list of leaves, + +where "leaves" are the first non-dict item. 
+ +i.e {"a": {"b": {"c": 1}, "d": 2}, "e": 3} becomes [1, 2, 3] **Arguments**: -- `partition`: partition number +- `d`: initially, a dict (with potentially nested dicts) **Returns**: -instance of `StorePartition` - - +a list with all the leaves of the various contained dicts -#### Store.revoke\_partition + -```python -def revoke_partition(partition: int) -``` +## quixstreams.utils.settings -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L83) + -Revoke assigned store partition +### BaseSettings -**Arguments**: +```python +class BaseSettings(_BaseSettings) +``` -- `partition`: partition number +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/utils/settings.py#L10) - + -#### Store.start\_partition\_transaction +#### BaseSettings.as\_dict ```python -def start_partition_transaction(partition: int) -> PartitionTransaction +def as_dict(plaintext_secrets: bool = False, + include: Optional[Set[str]] = None) -> dict ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L101) - -Start a new partition transaction. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/utils/settings.py#L18) -`PartitionTransaction` is the primary interface for working with data in Stores. +Dump any non-empty config values as a dictionary. **Arguments**: -- `partition`: partition number +- `plaintext_secrets`: whether secret values are plaintext or obscured (***) +- `include`: optional list of fields to be included in the dictionary **Returns**: -instance of `PartitionTransaction` - - +a dictionary -#### Store.close + -```python -def close() -``` +## quixstreams.checkpointing.exceptions -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/store.py#L119) + -Close store and revoke all store partitions +## quixstreams.checkpointing - + -## quixstreams.state.base.partition +## quixstreams.checkpointing.checkpoint - + -### StorePartition +### BaseCheckpoint ```python -class StorePartition(ABC) +class BaseCheckpoint() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L29) -A base class to access state in the underlying storage. -It represents a single instance of some storage (e.g. a single database for -the persistent storage). +Base class to keep track of state updates and consumer offsets and to checkpoint these +updates on schedule. - +Two implementations exist: + * one for checkpointing the Application in quixstreams/checkpoint/checkpoint.py + * one for checkpointing the kafka source in quixstreams/sources/kafka/checkpoint.py -#### StorePartition.get\_processed\_offset + + +#### BaseCheckpoint.expired ```python -@abstractmethod -def get_processed_offset() -> Optional[int] +def expired() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L57) - -Get last processed offset for the given partition - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L58) -offset or `None` if there's no processed offset yet +Returns `True` if checkpoint deadline has expired OR +if the total number of processed offsets exceeded the "commit_every" limit +when it's defined. 
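As a rough illustration of how this flag is meant to be used, here is a hypothetical commit-loop helper. The `checkpoint` and `message` objects are assumptions for the example; `store_offset()`, `empty()`, `close()`, and `commit()` are documented just below in this section.

```python
# Hypothetical sketch of the commit-on-expiry pattern. "checkpoint" is assumed to be
# a BaseCheckpoint implementation; "message" carries topic/partition/offset attributes.

def maybe_commit(checkpoint, message) -> None:
    # Record the offset of the message that was just processed.
    checkpoint.store_offset(message.topic, message.partition, message.offset)

    # Finalize only when the commit interval or the "commit_every" counter is reached.
    if checkpoint.expired():
        if checkpoint.empty():
            checkpoint.close()   # nothing was stored, so only clean up
        else:
            checkpoint.commit()  # produce changelogs, commit offsets, flush stores
```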
- + -#### StorePartition.get\_changelog\_offset +#### BaseCheckpoint.empty ```python -@abstractmethod -def get_changelog_offset() -> Optional[int] +def empty() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L65) - -Get offset that the changelog is up-to-date with. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L68) -**Returns**: +Returns `True` if checkpoint doesn't have any offsets stored yet. -offset or `None` if there's no processed offset yet - + -#### StorePartition.write +#### BaseCheckpoint.store\_offset ```python -@abstractmethod -def write(cache: PartitionTransactionCache, processed_offset: Optional[int], - changelog_offset: Optional[int]) +def store_offset(topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L73) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L75) -Update the state with data from the update cache +Store the offset of the processed message to the checkpoint. **Arguments**: -- `cache`: The modified data -- `processed_offset`: The offset processed to generate the data. -- `changelog_offset`: The changelog message offset of the data. +- `topic`: topic name +- `partition`: partition number +- `offset`: message offset - + -#### StorePartition.get +#### BaseCheckpoint.close ```python @abstractmethod -def get(key: bytes, - cf_name: str = "default") -> Union[bytes, Literal[Marker.UNDEFINED]] +def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L88) - -Get a key from the store - -**Arguments**: - -- `key`: a key encoded to `bytes` -- `default`: a default value to return if the key is not found. -- `cf_name`: rocksdb column family name. Default - "default" +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L102) -**Returns**: +Perform cleanup (when the checkpoint is empty) instead of committing. -a value if the key is present in the store. Otherwise, `default` +Needed for exactly-once, as Kafka transactions are timeboxed. - + -#### StorePartition.exists +#### BaseCheckpoint.commit ```python @abstractmethod -def exists(key: bytes, cf_name: str = "default") -> bool +def commit() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L101) - -Check if a key is present in the store. - -**Arguments**: - -- `key`: a key encoded to `bytes`. -- `cf_name`: rocksdb column family name. Default - "default" - -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L110) -`True` if the key is present, `False` otherwise. +Commit the checkpoint. - + -#### StorePartition.begin +### Checkpoint ```python -def begin() -> PartitionTransaction +class Checkpoint(BaseCheckpoint) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L110) - -Start a new `PartitionTransaction` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L117) -Using `PartitionTransaction` is a recommended way for accessing the data. 
+Checkpoint implementation used by the application - + -#### StorePartition.recover\_from\_changelog\_message +#### Checkpoint.get\_store\_transaction ```python -def recover_from_changelog_message( - changelog_message: ConfluentKafkaMessageProto, - committed_offset: int) -> None +def get_store_transaction( + topic: str, + partition: int, + store_name: str = DEFAULT_STATE_STORE_NAME) -> PartitionTransaction ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/partition.py#L123) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L147) -Updates state from a given changelog message. +Get a PartitionTransaction for the given store, topic and partition. + +It will return already started transaction if there's one. **Arguments**: -- `changelog_message`: A raw Confluent message read from a changelog topic. -- `committed_offset`: latest committed offset for the partition +- `topic`: topic name +- `partition`: partition number +- `store_name`: store name - +**Returns**: -## quixstreams.state.base.transaction +instance of `PartitionTransaction` - + -### PartitionTransactionCache +#### Checkpoint.close ```python -class PartitionTransactionCache() +def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L36) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L170) -A cache with the data updated in the current PartitionTransaction. -It is used to read-your-own-writes before the transaction is committed to the Store. +Perform cleanup (when the checkpoint is empty) instead of committing. -Internally, updates and deletes are separated into two separate structures -to simplify the querying over them. +Needed for exactly-once, as Kafka transactions are timeboxed. - + -#### PartitionTransactionCache.get +#### Checkpoint.commit ```python -def get(key: bytes, - prefix: bytes, - cf_name: str = "default") -> Union[bytes, Marker] +def commit() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L59) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/checkpointing/checkpoint.py#L179) -Get a value for the key. +Commit the checkpoint. -Returns the key value if it has been updated during the transaction. +This method will: + 1. Produce the changelogs for each state store + 2. Flush the producer to ensure everything is delivered. + 3. Commit topic offsets. + 4. Flush each state store partition to the disk. -If the key has already been deleted, returns "DELETED" sentinel -(we don't need to check the actual store). -If the key is not present in the cache, returns "UNDEFINED sentinel -(we need to check the store). + -:param: key: key as bytes -:param: prefix: key prefix as bytes -:param: cf_name: column family name +## quixstreams.types + - +## quixstreams.processing -#### PartitionTransactionCache.set + -```python -def set(key: bytes, value: bytes, prefix: bytes, cf_name: str = "default") -``` +## quixstreams.processing.context + + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L89) +### ProcessingContext -Set a value for the key. 
+```python +@dataclasses.dataclass +class ProcessingContext() +``` -:param: key: key as bytes -:param: value: value as bytes -:param: prefix: key prefix as bytes -:param: cf_name: column family name +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/context.py#L23) +A class to share processing-related objects +between `Application` and `StreamingDataFrame` instances. - + -#### PartitionTransactionCache.delete +#### ProcessingContext.store\_offset ```python -def delete(key: Any, prefix: bytes, cf_name: str = "default") +def store_offset(topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L102) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/context.py#L47) -Delete a key. +Store the offset of the processed message to the checkpoint. -:param: key: key as bytes -:param: value: value as bytes -:param: prefix: key prefix as bytes -:param: cf_name: column family name +**Arguments**: +- `topic`: topic name +- `partition`: partition number +- `offset`: message offset - + -#### PartitionTransactionCache.is\_empty +#### ProcessingContext.init\_checkpoint ```python -def is_empty() -> bool +def init_checkpoint() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L115) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/context.py#L57) -Return True if any changes have been made (updates or deletes), otherwise -return False. +Initialize a new checkpoint - + -#### PartitionTransactionCache.get\_column\_families +#### ProcessingContext.commit\_checkpoint ```python -def get_column_families() -> Set[str] +def commit_checkpoint(force: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L122) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/context.py#L72) -Get all update column families. +Attempts finalizing the current Checkpoint only if the Checkpoint is "expired", - +or `force=True` is passed, otherwise do nothing. -#### PartitionTransactionCache.get\_updates +To finalize: the Checkpoint will be committed if it has any stored offsets, +else just close it. A new Checkpoint is then created. -```python -def get_updates(cf_name: str = "default") -> Dict[bytes, Dict[bytes, bytes]] -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L128) +- `force`: if `True`, commit the Checkpoint before its expiration deadline. -Get all updated keys (excluding deleted) + -in the format "{: {: }}". +## quixstreams.processing.pausing -:param: cf_name: column family name + - - - -#### PartitionTransactionCache.get\_deletes +### PausingManager ```python -def get_deletes(cf_name: str = "default") -> Set[bytes] +class PausingManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L137) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/pausing.py#L15) -Get all deleted keys (excluding updated) as a set. +A class to temporarily pause topic partitions and resume them after +the timeout is elapsed. 
- + -### PartitionTransactionStatus +#### PausingManager.pause ```python -class PartitionTransactionStatus(enum.Enum) +def pause(topic: str, partition: int, offset_to_seek: int, + resume_after: float) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L144) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/pausing.py#L28) - +Pause the topic-partition for a certain period of time. -#### STARTED +This method is supposed to be called in case of backpressure from Sinks. -Transaction is started and accepts updates + - +#### PausingManager.is\_paused -#### PREPARED +```python +def is_paused(topic: str, partition: int) -> bool +``` -Transaction is prepared, it can no longer receive updates +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/pausing.py#L68) - +Check if the topic-partition is already paused -#### COMPLETE + -Transaction is fully completed, it cannot be used anymore +#### PausingManager.resume\_if\_ready - +```python +def resume_if_ready() +``` -#### FAILED +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/pausing.py#L74) -Transaction is failed, it cannot be used anymore +Resume consuming from topic-partitions after the wait period has elapsed. - + -#### validate\_transaction\_status +#### PausingManager.revoke ```python -def validate_transaction_status(*allowed: PartitionTransactionStatus) +def revoke(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L155) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/processing/pausing.py#L94) -Check that the status of `RocksDBTransaction` is valid before calling a method +Remove partition from the list of paused TPs if it's revoked - + -### PartitionTransaction +## quixstreams.sinks.base.exceptions + + + +### SinkBackpressureError ```python -class PartitionTransaction(ABC) +class SinkBackpressureError(QuixException) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L175) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/exceptions.py#L6) -A transaction class to perform simple key-value operations like -"get", "set", "delete" and "exists" on a single storage partition. +An exception to be raised by Sinks during flush() call - +to signal a backpressure event to the application. -#### PartitionTransaction.failed +When raised, the app will drop the accumulated sink batch, +pause the corresponding topic partition for +a timeout specified in `retry_after`, and resume it when it's elapsed. -```python -@property -def failed() -> bool -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L207) +- `retry_after`: a timeout in seconds to pause for +- `topic`: a topic name to pause +- `partition`: a partition number to pause -Return `True` if transaction failed to update data at some point. + -Failed transactions cannot be re-used. 
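To make the interaction with sinks concrete, here is a sketch (not library code) of how the application internals might react to a backpressure event with this manager. The `pausing_manager`, `exc`, and `consumer_position` names are assumptions, and the `SinkBackpressureError` attributes are inferred from the arguments documented above; only the method signatures shown in this section are used.

```python
# Hypothetical glue code: on backpressure the topic partition is paused, and the
# poll loop periodically resumes partitions whose timeout has elapsed.

def handle_backpressure(pausing_manager, exc, consumer_position: int) -> None:
    # "exc" is assumed to be a SinkBackpressureError carrying retry_after/topic/partition.
    if not pausing_manager.is_paused(exc.topic, exc.partition):
        pausing_manager.pause(
            topic=exc.topic,
            partition=exc.partition,
            offset_to_seek=consumer_position,  # re-read the dropped batch from here
            resume_after=exc.retry_after,
        )

def on_poll_loop_tick(pausing_manager) -> None:
    # Called on every iteration of the poll loop.
    pausing_manager.resume_if_ready()
```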
+## quixstreams.sinks.base.manager -**Returns**: + -bool +## quixstreams.sinks.base - + -#### PartitionTransaction.completed +## quixstreams.sinks.base.batch + + + +### SinkBatch ```python -@property -def completed() -> bool +class SinkBatch() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L217) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/batch.py#L12) -Return `True` if transaction is successfully completed. +A batch to accumulate processed data by `BatchingSink` between the checkpoints. -Completed transactions cannot be re-used. +Batches are created automatically by the implementations of `BatchingSink`. -**Returns**: +**Arguments**: -bool +- `topic`: a topic name +- `partition`: a partition number - + -#### PartitionTransaction.prepared +#### SinkBatch.iter\_chunks ```python -@property -def prepared() -> bool +def iter_chunks(n: int) -> Iterable[Iterable[SinkItem]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L227) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/batch.py#L69) -Return `True` if transaction is prepared completed. +Iterate over batch data in chunks of length n. +The last batch may be shorter. -Prepared transactions cannot receive new updates, but can be flushed. + -**Returns**: +## quixstreams.sinks.base.item -bool + - +## quixstreams.sinks.base.sink -#### PartitionTransaction.changelog\_topic\_partition + + +### BaseSink ```python -@property -def changelog_topic_partition() -> Optional[Tuple[str, int]] +class BaseSink(abc.ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L237) - -Return the changelog topic-partition for the StorePartition of this transaction. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L11) -Returns `None` if changelog_producer is not provided. +This is a base class for all sinks. -**Returns**: +Subclass it and implement its methods to create your own sink. -(topic, partition) or None +Note that Sinks are currently in beta, and their design may change over time. - + -#### PartitionTransaction.as\_state +#### BaseSink.flush ```python -def as_state(prefix: Any = DEFAULT_PREFIX) -> State +@abc.abstractmethod +def flush(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L264) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L21) -Create an instance implementing the `State` protocol to be provided - -to `StreamingDataFrame` functions. -All operations called on this State object will be prefixed with -the supplied `prefix`. +This method is triggered by the Checkpoint class when it commits. -**Returns**: +You can use `flush()` to write the batched data to the destination (in case of +a batching sink), or confirm the delivery of the previously sent messages +(in case of a streaming sink). -an instance implementing the `State` protocol +If flush() fails, the checkpoint will be aborted. 
- + -#### PartitionTransaction.get +#### BaseSink.add ```python -@validate_transaction_status(PartitionTransactionStatus.STARTED) -def get(key: Any, - prefix: bytes, - default: Any = None, - cf_name: str = "default") -> Optional[Any] +@abc.abstractmethod +def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, + topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L283) - -Get a key from the store. - -It returns `None` if the key is not found and `default` is not provided. - -**Arguments**: - -- `key`: key -- `prefix`: a key prefix -- `default`: default value to return if the key is not found -- `cf_name`: column family name +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L33) -**Returns**: +This method is triggered on every new processed record being sent to this sink. -value or None if the key is not found and `default` is not provided +You can use it to accumulate batches of data before sending them outside, or +to send results right away in a streaming manner and confirm a delivery later +on flush(). - + -#### PartitionTransaction.set +#### BaseSink.on\_paused ```python -@validate_transaction_status(PartitionTransactionStatus.STARTED) -def set(key: Any, value: Any, prefix: bytes, cf_name: str = "default") +def on_paused(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L319) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L51) -Set value for the key. - -**Arguments**: +This method is triggered when the sink is paused due to backpressure, when +the `SinkBackpressureError` is raised. -- `key`: key -- `prefix`: a key prefix -- `value`: value -- `cf_name`: column family name +Here you can react to the backpressure events. - + -#### PartitionTransaction.delete +### BatchingSink ```python -@validate_transaction_status(PartitionTransactionStatus.STARTED) -def delete(key: Any, prefix: bytes, cf_name: str = "default") +class BatchingSink(BaseSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L342) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L60) -Delete value for the key. +A base class for batching sinks, that need to accumulate the data first before +sending it to the external destinatios. -This function always returns `None`, even if value is not found. +Examples: databases, objects stores, and other destinations where +writing every message is not optimal. -**Arguments**: +It automatically handles batching, keeping batches in memory per topic-partition. -- `key`: key -- `prefix`: a key prefix -- `cf_name`: column family name +You may subclass it and override the `write()` method to implement a custom +batching sink. - + -#### PartitionTransaction.exists +#### BatchingSink.write ```python -@validate_transaction_status(PartitionTransactionStatus.STARTED) -def exists(key: Any, prefix: bytes, cf_name: str = "default") -> bool +@abc.abstractmethod +def write(batch: SinkBatch) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L361) - -Check if the key exists in state. 
- -**Arguments**: - -- `key`: key -- `prefix`: a key prefix -- `cf_name`: column family name +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L83) -**Returns**: +This method implements actual writing to the external destination. -True if key exists, False otherwise +It may also raise `SinkBackpressureError` if the destination cannot accept new +writes at the moment. +When this happens, the accumulated batch is dropped and the app pauses the +corresponding topic partition. - + -#### PartitionTransaction.prepare +#### BatchingSink.add ```python -@validate_transaction_status(PartitionTransactionStatus.STARTED) -def prepare(processed_offset: Optional[int]) +def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, + topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L381) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L93) -Produce changelog messages to the changelog topic for all changes accumulated +Add a new record to in-memory batch. -in this transaction and prepare transaction to flush its state to the state -store. + -After successful `prepare()`, the transaction status is changed to PREPARED, -and it cannot receive updates anymore. +#### BatchingSink.flush -If changelog is disabled for this application, no updates will be produced -to the changelog topic. +```python +def flush(topic: str, partition: int) +``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L115) -- `processed_offset`: the offset of the latest processed message +Flush an accumulated batch to the destination and drop it afterward. - + -#### PartitionTransaction.flush +#### BatchingSink.on\_paused ```python -@validate_transaction_status(PartitionTransactionStatus.STARTED, - PartitionTransactionStatus.PREPARED) -def flush(processed_offset: Optional[int] = None, - changelog_offset: Optional[int] = None) +def on_paused(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/transaction.py#L442) - -Flush the recent updates to the database. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L135) -It writes the WriteBatch to RocksDB and marks itself as finished. +When the destination is already backpressure, drop the accumulated batch. -If writing fails, the transaction is marked as failed and -cannot be used anymore. + ->***NOTE:*** If no keys have been modified during the transaction - (i.e. no "set" or "delete" have been called at least once), it will - not flush ANY data to the database including the offset to optimize - I/O. +## quixstreams.sinks.core -**Arguments**: + -- `processed_offset`: offset of the last processed message, optional. -- `changelog_offset`: offset of the last produced changelog message, -optional. 
+## quixstreams.sinks.core.csv - + -## quixstreams.state.base +### CSVSink - +```python +class CSVSink(BatchingSink) +``` -## quixstreams.state.base.state +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/csv.py#L9) - + -### State +#### CSVSink.\_\_init\_\_ ```python -class State(ABC) +def __init__(path: str, + dialect: str = "excel", + key_serializer: Callable[[Any], str] = str, + value_serializer: Callable[[Any], str] = json.dumps) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/csv.py#L10) -Primary interface for working with key-value state data from `StreamingDataFrame` +A base CSV sink that writes data from all assigned partitions to a single file. - +It's best to be used for local debugging. -#### State.get +Column format: + (key, value, timestamp, topic, partition, offset) -```python -@abstractmethod -def get(key: Any, default: Any = None) -> Optional[Any] -``` +**Arguments**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L19) +- `path`: a path to CSV file +- `dialect`: a CSV dialect to use. It affects quoting and delimiters. +See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info. +Default - `"excel"`. +- `key_serializer`: a callable to convert keys to strings. +Default - `str`. +- `value_serializer`: a callable to convert values to strings. +Default - `json.dumps`. -Get the value for key if key is present in the state, else default + -**Arguments**: +## quixstreams.sinks.core.influxdb3 -- `key`: key -- `default`: default value to return if the key is not found + -**Returns**: +### InfluxDB3Sink -value or None if the key is not found and `default` is not provided +```python +class InfluxDB3Sink(BatchingSink) +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/influxdb3.py#L23) -#### State.set + + +#### InfluxDB3Sink.\_\_init\_\_ ```python -@abstractmethod -def set(key: Any, value: Any) +def __init__(token: str, + host: str, + organization_id: str, + database: str, + measurement: str, + fields_keys: Iterable[str] = (), + tags_keys: Iterable[str] = (), + time_key: Optional[str] = None, + time_precision: WritePrecision = WritePrecision.MS, + include_metadata_tags: bool = False, + batch_size: int = 1000, + enable_gzip: bool = True, + request_timeout_ms: int = 10_000, + debug: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L30) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/influxdb3.py#L24) -Set value for the key. +A connector to sink processed data to InfluxDB v3. -**Arguments**: +It batches the processed records in memory per topic partition, converts +them to the InfluxDB format, and flushes them to InfluxDB at the checkpoint. -- `key`: key -- `value`: value +The InfluxDB sink transparently handles backpressure if the destination instance +cannot accept more data at the moment +(e.g., when InfluxDB returns an HTTP 429 error with the "retry_after" header set). +When this happens, the sink will notify the Application to pause consuming +from the backpressured topic partition until the "retry_after" timeout elapses. - +>***NOTE***: InfluxDB3Sink can accept only dictionaries. 
+> If the record values are not dicts, you need to convert them to dicts before +> sinking. -#### State.delete +**Arguments**: -```python -@abstractmethod -def delete(key: Any) -``` +- `token`: InfluxDB access token +- `host`: InfluxDB host in format "https://" +- `organization_id`: InfluxDB organization_id +- `database`: database name +- `fields_keys`: a list of keys to be used as "fields" when writing to InfluxDB. +If present, it must not overlap with "tags_keys". +If empty, the whole record value will be used. +>***NOTE*** The fields' values can only be strings, floats, integers, or booleans. +Default - `()`. +- `tags_keys`: a list of keys to be used as "tags" when writing to InfluxDB. +If present, it must not overlap with "fields_keys". +These keys will be popped from the value dictionary +automatically because InfluxDB doesn't allow the same keys be +both in tags and fields. +If empty, no tags will be sent. +>***NOTE***: InfluxDB client always converts tag values to strings. +Default - `()`. +- `time_key`: a key to be used as "time" when writing to InfluxDB. +By default, the record timestamp will be used with "ms" time precision. +When using a custom key, you may need to adjust the `time_precision` setting +to match. +- `time_precision`: a time precision to use when writing to InfluxDB. +- `include_metadata_tags`: if True, includes record's key, topic, +and partition as tags. +Default - `False`. +- `batch_size`: how many records to write to InfluxDB in one request. +Note that it only affects the size of one write request, and not the number +of records flushed on each checkpoint. +Default - `1000`. +- `enable_gzip`: if True, enables gzip compression for writes. +Default - `True`. +- `request_timeout_ms`: an HTTP request timeout in milliseconds. +Default - `10000`. +- `debug`: if True, print debug logs from InfluxDB client. +Default - `False`. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L39) + -Delete value for the key. +## quixstreams.sinks.community -This function always returns `None`, even if value is not found. +This module contains Sinks developed and maintained by the members of Quix Streams community. -**Arguments**: + -- `key`: key +## quixstreams.sinks.community.file.formats.base - + -#### State.exists +### Format ```python -@abstractmethod -def exists(key: Any) -> bool +class Format(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L49) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L8) -Check if the key exists in state. +Base class for formatting batches in file sinks. -**Arguments**: +This abstract base class defines the interface for batch formatting +in file sinks. Subclasses should implement the `file_extension` +property and the `serialize` method to define how batches are +formatted and saved. -- `key`: key + -**Returns**: +#### Format.file\_extension -True if key exists, False otherwise +```python +@property +@abstractmethod +def file_extension() -> str +``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L20) -### TransactionState +Returns the file extension used for output files. -```python -class TransactionState(State) -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L58) +The file extension as a string. 
- + -#### TransactionState.\_\_init\_\_ +#### Format.supports\_append ```python -def __init__(prefix: bytes, transaction: "PartitionTransaction") +@property +@abstractmethod +def supports_append() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L64) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L30) -Simple key-value state to be provided into `StreamingDataFrame` functions +Indicates if the format supports appending data to an existing file. -**Arguments**: +**Returns**: -- `transaction`: instance of `PartitionTransaction` +True if appending is supported, otherwise False. - + -#### TransactionState.get +#### Format.serialize ```python -def get(key: Any, default: Any = None) -> Optional[Any] +@abstractmethod +def serialize(batch: SinkBatch) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L73) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L39) -Get the value for key if key is present in the state, else default +Serializes a batch of messages into bytes. **Arguments**: -- `key`: key -- `default`: default value to return if the key is not found +- `batch`: The batch of messages to serialize. **Returns**: -value or None if the key is not found and `default` is not provided +The serialized batch as bytes. - + -#### TransactionState.set +## quixstreams.sinks.community.file.formats.parquet + + + +### ParquetFormat ```python -def set(key: Any, value: Any) +class ParquetFormat(Format) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L16) -Set value for the key. +Serializes batches of messages into Parquet format. -**Arguments**: +This class provides functionality to serialize a `SinkBatch` into bytes +in Parquet format using PyArrow. It allows setting the file extension +and compression algorithm used for the Parquet files. -- `key`: key -- `value`: value +This format does not support appending to existing files. - + -#### TransactionState.delete +#### ParquetFormat.\_\_init\_\_ ```python -def delete(key: Any) +def __init__(file_extension: str = ".parquet", + compression: Compression = "snappy") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L91) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L29) -Delete value for the key. - -This function always returns `None`, even if value is not found. +Initializes the ParquetFormat. **Arguments**: -- `key`: key +- `file_extension`: The file extension to use for output files. +Defaults to ".parquet". +- `compression`: The compression algorithm to use for Parquet files. +Allowed values are "none", "snappy", "gzip", "brotli", "lz4", +or "zstd". Defaults to "snappy". - + -#### TransactionState.exists +#### ParquetFormat.file\_extension ```python -def exists(key: Any) -> bool +@property +def file_extension() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L100) - -Check if the key exists in state. 
- -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L47) -- `key`: key +Returns the file extension used for output files. **Returns**: -True if key exists, False otherwise +The file extension as a string. - + -## quixstreams.exceptions +#### ParquetFormat.serialize - +```python +def serialize(batch: SinkBatch) -> bytes +``` -## quixstreams.exceptions.assignment +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L55) - +Serializes a `SinkBatch` into bytes in Parquet format. -### PartitionAssignmentError +Each item in the batch is converted into a dictionary with "_timestamp", +"_key", and the keys from the message value. If the message key is in +bytes, it is decoded to a string. -```python -class PartitionAssignmentError(QuixException) -``` +Missing fields in messages are filled with `None` to ensure all rows +have the same columns. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/exceptions/assignment.py#L6) +**Arguments**: -Error happened during partition rebalancing. -Raised from `on_assign`, `on_revoke` and `on_lost` callbacks +- `batch`: The `SinkBatch` to serialize. - +**Returns**: -## quixstreams.exceptions.base +The serialized batch as bytes in Parquet format. - + -## quixstreams.context +## quixstreams.sinks.community.file.formats - + -#### set\_message\_context +### InvalidFormatError ```python -def set_message_context(context: Optional[MessageContext]) +class InvalidFormatError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/context.py#L22) - -Set a MessageContext for the current message in the given `contextvars.Context` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/__init__.py#L17) ->***NOTE:*** This is for advanced usage only. If you need to change the message key, -`StreamingDataFrame.to_topic()` has an argument for it. +Raised when the format is specified incorrectly. + -Example Snippet: +#### resolve\_format ```python -from quixstreams import Application, set_message_context, message_context +def resolve_format(format: Union[FormatName, Format]) -> Format +``` -# Changes the current sdf value based on what the message partition is. -def alter_context(value): - context = message_context() - if value > 1: - context.headers = context.headers + (b"cool_new_header", value.encode()) - set_message_context(context) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/__init__.py#L23) -app = Application() -sdf = app.dataframe() -sdf = sdf.update(lambda value: alter_context(value)) -``` +Resolves the format into a `Format` instance. **Arguments**: -- `context`: instance of `MessageContext` +- `format`: The format to resolve, either a format name ("json", +"parquet") or a `Format` instance. - +**Raises**: -#### message\_context +- `InvalidFormatError`: If the format name is invalid. -```python -def message_context() -> Optional[MessageContext] -``` +**Returns**: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/context.py#L53) +An instance of `Format` corresponding to the specified format. 
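For completeness, here is a hypothetical example of implementing the `Format` interface documented above. The class name and the tab-separated layout are illustrative, and the per-item attribute names (`timestamp`, `key`, `value`) are assumptions; the import paths follow the module layout referenced in this section.

```python
# Hypothetical custom format implementing the abstract Format interface:
# file_extension, supports_append, and serialize().
from quixstreams.sinks.base.batch import SinkBatch
from quixstreams.sinks.community.file.formats.base import Format


class TSVFormat(Format):
    @property
    def file_extension(self) -> str:
        return ".tsv"

    @property
    def supports_append(self) -> bool:
        # Plain text rows can safely be appended to an existing file.
        return True

    def serialize(self, batch: SinkBatch) -> bytes:
        # One tab-separated row per message (item attribute names are assumed).
        lines = []
        for chunk in batch.iter_chunks(n=1000):
            for item in chunk:
                lines.append(f"{item.timestamp}\t{item.key}\t{item.value}")
        return ("\n".join(lines) + "\n").encode("utf-8")
```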
-Get a MessageContext for the current message, which houses most of the message + -metadata, like: - - key - - timestamp - - partition - - offset +## quixstreams.sinks.community.file.formats.json + -Example Snippet: +### JSONFormat ```python -from quixstreams import Application, message_context - -# Changes the current sdf value based on what the message partition is. - -app = Application() -sdf = app.dataframe() -sdf = sdf.apply(lambda value: 1 if message_context().partition == 2 else 0) +class JSONFormat(Format) ``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L14) -instance of `MessageContext` +Serializes batches of messages into JSON Lines format with optional gzip +compression. - +This class provides functionality to serialize a `SinkBatch` into bytes +in JSON Lines format. It supports optional gzip compression and allows +for custom JSON serialization through the `dumps` parameter. -## quixstreams.kafka.configuration +This format supports appending to existing files. - + -### ConnectionConfig +#### JSONFormat.\_\_init\_\_ ```python -class ConnectionConfig(BaseSettings) +def __init__(file_extension: str = ".jsonl", + compress: bool = False, + dumps: Optional[Callable[[Any], str]] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/configuration.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L28) -Provides an interface for all librdkafka connection-based configs. +Initializes the JSONFormat. -Allows converting to or from a librdkafka dictionary. +**Arguments**: -Also obscures secrets and handles any case sensitivity issues. +- `file_extension`: The file extension to use for output files. +Defaults to ".jsonl". +- `compress`: If `True`, compresses the output using gzip and +appends ".gz" to the file extension. Defaults to `False`. +- `dumps`: A custom function to serialize objects to JSON-formatted +strings. If provided, the `compact` option is ignored. - + -#### ConnectionConfig.settings\_customise\_sources +#### JSONFormat.file\_extension ```python -@classmethod -def settings_customise_sources( - cls, settings_cls: Type[PydanticBaseSettings], - init_settings: PydanticBaseSettingsSource, - env_settings: PydanticBaseSettingsSource, - dotenv_settings: PydanticBaseSettingsSource, - file_secret_settings: PydanticBaseSettingsSource -) -> Tuple[PydanticBaseSettingsSource, ...] +@property +def file_extension() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/configuration.py#L99) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L57) -Included to ignore reading/setting values from the environment +Returns the file extension used for output files. - +**Returns**: -#### ConnectionConfig.from\_librdkafka\_dict +The file extension as a string. 
+ + + +#### JSONFormat.serialize ```python -@classmethod -def from_librdkafka_dict(cls, - config: dict, - ignore_extras: bool = False) -> Self +def serialize(batch: SinkBatch) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/configuration.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L65) -Create a `ConnectionConfig` from a librdkafka config dictionary. +Serializes a `SinkBatch` into bytes in JSON Lines format. + +Each item in the batch is converted into a JSON object with +"_timestamp", "_key", and "_value" fields. If the message key is +in bytes, it is decoded to a string. **Arguments**: -- `config`: a dict of configs (like {"bootstrap.servers": "url"}) -- `ignore_extras`: Ignore non-connection settings (else raise exception) +- `batch`: The `SinkBatch` to serialize. **Returns**: -a ConnectionConfig +The serialized batch in JSON Lines format, optionally +compressed with gzip. - + -#### ConnectionConfig.as\_librdkafka\_dict +## quixstreams.sinks.community.file.destinations.base + + + +### Destination ```python -def as_librdkafka_dict(plaintext_secrets: bool = True) -> dict +class Destination(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/configuration.py#L128) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L16) -Dump any non-empty config values as a librdkafka dictionary. +Abstract base class for defining where and how data should be stored. ->***NOTE***: All secret values will be dumped in PLAINTEXT by default. +Destinations handle the storage of serialized data, whether that's to local +disk, cloud storage, or other locations. They manage the physical writing of +data while maintaining a consistent directory/path structure based on topics +and partitions. -**Arguments**: + -- `plaintext_secrets`: whether secret values are plaintext or obscured (***) +#### Destination.set\_directory -**Returns**: +```python +def set_directory(directory: str) -> None +``` -a librdkafka-compatible dictionary +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L28) - +Configure the base directory for storing files. -## quixstreams.kafka +**Arguments**: - +- `directory`: The base directory path where files will be stored. -## quixstreams.kafka.producer +**Raises**: - +- `ValueError`: If the directory path contains invalid characters. +Only alphanumeric characters (a-zA-Z0-9), spaces, dots, and +underscores are allowed. -### Producer + + +#### Destination.set\_extension ```python -class Producer() +def set_extension(format: Format) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L42) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L45) - +Set the file extension based on the format. -#### Producer.\_\_init\_\_ +**Arguments**: + +- `format`: The Format instance that defines the file extension. 
+ + + +#### Destination.write ```python -def __init__(broker_address: Union[str, ConnectionConfig], - logger: logging.Logger = logger, - error_callback: Callable[[KafkaError], None] = _default_error_cb, - extra_config: Optional[dict] = None, - flush_timeout: Optional[float] = None) +@abstractmethod +def write(data: bytes, batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L54) -A wrapper around `confluent_kafka.Producer`. - -It initializes `confluent_kafka.Producer` on demand -avoiding network calls during `__init__`, provides typing info for methods -and some reasonable defaults. +Write the serialized data to storage. **Arguments**: -- `broker_address`: Connection settings for Kafka. -Accepts string with Kafka broker host and port formatted as `:`, -or a ConnectionConfig object if authentication is required. -- `logger`: a Logger instance to attach librdkafka logging to -- `error_callback`: callback used for producer errors -- `extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Producer` as is. -Note: values passed as arguments override values in `extra_config`. -- `flush_timeout`: The time the producer is waiting for all messages to be delivered. +- `data`: The serialized data to write. +- `batch`: The batch information containing topic, partition and offset +details. - + -#### Producer.produce +## quixstreams.sinks.community.file.destinations.local -```python -def produce(topic: str, - value: Optional[Union[str, bytes]] = None, - key: Optional[Union[str, bytes]] = None, - headers: Optional[Headers] = None, - partition: Optional[int] = None, - timestamp: Optional[int] = None, - poll_timeout: float = 5.0, - buffer_error_max_tries: int = 3, - on_delivery: Optional[DeliveryCallback] = None) -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L81) +### LocalDestination -Produce a message to a topic. +```python +class LocalDestination(Destination) +``` -It also polls Kafka for callbacks before producing to minimize -the probability of `BufferError`. -If `BufferError` still happens, the method will poll Kafka with timeout -to free up the buffer and try again. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L15) -**Arguments**: +A destination that writes data to the local filesystem. -- `topic`: topic name -- `value`: message value -- `key`: message key -- `headers`: message headers -- `partition`: topic partition -- `timestamp`: message timestamp -- `poll_timeout`: timeout for `poll()` call in case of `BufferError` -- `buffer_error_max_tries`: max retries for `BufferError`. -Pass `0` to not retry after `BufferError`. -- `on_delivery`: the delivery callback to be triggered on `poll()` -for the produced message. +Handles writing data to local files with support for both creating new files +and appending to existing ones. 
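For example (a minimal sketch; the import path mirrors the module name above), append mode is chosen at construction time and the instance is then passed to `FileSink` as its `destination`:

```python
from quixstreams.sinks.community.file.destinations.local import LocalDestination

# Append to existing files instead of creating a new file per batch.
# Requires a format that supports appending, such as JSONFormat.
destination = LocalDestination(append=True)
```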
- + -#### Producer.poll +#### LocalDestination.\_\_init\_\_ ```python -def poll(timeout: float = 0) +def __init__(append: bool = False) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L142) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L22) -Polls the producer for events and calls `on_delivery` callbacks. +Initialize the local destination. **Arguments**: -- `timeout`: poll timeout seconds; Default: 0 (unlike others) -> NOTE: -1 will hang indefinitely if there are no messages to acknowledge +- `append`: If True, append to existing files instead of creating new +ones. Defaults to False. - + -#### Producer.flush +#### LocalDestination.set\_extension ```python -def flush(timeout: Optional[float] = None) -> int +def set_extension(format: Format) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L150) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L32) -Wait for all messages in the Producer queue to be delivered. +Set the file extension and validate append mode compatibility. **Arguments**: -- `timeout` (`float`): time to attempt flushing (seconds). -None use producer default or -1 is infinite. Default: None +- `format`: The Format instance that defines the file extension. -**Returns**: +**Raises**: -number of messages remaining to flush +- `ValueError`: If append mode is enabled but the format doesn't +support appending. - + -### TransactionalProducer +#### LocalDestination.write ```python -class TransactionalProducer(Producer) +def write(data: bytes, batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/producer.py#L181) - -A separate producer class used only internally for transactions -(transactions are only needed when using a consumer). - - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L43) -## quixstreams.kafka.consumer +Write data to a local file. - +**Arguments**: -### BaseConsumer +- `data`: The serialized data to write. +- `batch`: The batch information containing topic and partition details. -```python -class BaseConsumer() -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L68) +## quixstreams.sinks.community.file.destinations.s3 - + -#### BaseConsumer.\_\_init\_\_ +### S3BucketNotFoundError ```python -def __init__(broker_address: Union[str, ConnectionConfig], - consumer_group: Optional[str], - auto_offset_reset: AutoOffsetReset, - auto_commit_enable: bool = True, - logger: logging.Logger = logger, - error_callback: Callable[[KafkaError], None] = _default_error_cb, - on_commit: Optional[Callable[ - [Optional[KafkaError], List[TopicPartition]], None]] = None, - extra_config: Optional[dict] = None) +class S3BucketNotFoundError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L69) - -A wrapper around `confluent_kafka.Consumer`. - -It initializes `confluent_kafka.Consumer` on demand -avoiding network calls during `__init__`, provides typing info for methods -and some reasonable defaults. 
- -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L13) -- `broker_address`: Connection settings for Kafka. -Accepts string with Kafka broker host and port formatted as `:`, -or a ConnectionConfig object if authentication is required. -- `consumer_group`: Kafka consumer group. -Passed as `group.id` to `confluent_kafka.Consumer` -- `auto_offset_reset`: Consumer `auto.offset.reset` setting. -Available values: -
"earliest" - automatically reset the offset to the smallest offset -
"latest" - automatically reset the offset to the largest offset -
"error" - trigger an error (`ERR__AUTO_OFFSET_RESET`) which is - retrieved by consuming messages (used for testing) -- `auto_commit_enable`: If true, periodically commit offset of -the last message handed to the application. Default - `True`. -- `logger`: a Logger instance to attach librdkafka logging to -- `error_callback`: callback used for consumer errors -- `on_commit`: Offset commit result propagation callback. -Passed as "offset_commit_cb" to `confluent_kafka.Consumer`. -- `extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Consumer` as is. -Note: values passed as arguments override values in `extra_config`. +Raised when the specified S3 bucket does not exist. - + -#### BaseConsumer.poll +### S3BucketAccessDeniedError ```python -def poll(timeout: Optional[float] = None) -> Optional[Message] +class S3BucketAccessDeniedError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L132) - -Consumes a single message, calls callbacks and returns events. - -The application must check the returned :py:class:`Message` -object's :py:func:`Message.error()` method to distinguish between proper -messages (error() returns None), or an event or error. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L17) -Note: a `RebalancingCallback` may be called from this method ( -`on_assign`, `on_revoke`, or `on_lost`). +Raised when the specified S3 bucket access is denied. -**Arguments**: + -- `timeout` (`float`): Maximum time in seconds to block waiting for message, -event or callback. None or -1 is infinite. Default: None. +### S3Destination -**Raises**: +```python +class S3Destination(Destination) +``` -- `RuntimeError`: if called on a closed consumer +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L21) -**Returns**: +A destination that writes data to Amazon S3. -`Optional[Message]`: A `Message` object or `None` on timeout +Handles writing data to S3 buckets using the AWS SDK. Credentials can be +provided directly or via environment variables. - + -#### BaseConsumer.unsubscribe +#### S3Destination.\_\_init\_\_ ```python -def unsubscribe() +def __init__(bucket: str, + aws_access_key_id: Optional[str] = getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key: Optional[str] = getenv( + "AWS_SECRET_ACCESS_KEY"), + region_name: Optional[str] = getenv("AWS_REGION", + getenv("AWS_DEFAULT_REGION")), + **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L235) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L28) -Remove current subscription. +Initialize the S3 destination. + +**Arguments**: + +- `bucket`: Name of the S3 bucket to write to. +- `aws_access_key_id`: AWS access key ID. Defaults to AWS_ACCESS_KEY_ID +environment variable. +- `aws_secret_access_key`: AWS secret access key. Defaults to +AWS_SECRET_ACCESS_KEY environment variable. +- `region_name`: AWS region name. Defaults to AWS_REGION or +AWS_DEFAULT_REGION environment variable. +- `kwargs`: Additional keyword arguments passed to boto3.client. **Raises**: -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer +- `S3BucketNotFoundError`: If the specified bucket doesn't exist. 
+- `S3BucketAccessDeniedError`: If access to the bucket is denied. - + -#### BaseConsumer.store\_offsets +#### S3Destination.write ```python -def store_offsets(message: Optional[Message] = None, - offsets: Optional[List[TopicPartition]] = None) +def write(data: bytes, batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L244) - -Store offsets for a message or a list of offsets. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L78) -`message` and `offsets` are mutually exclusive. The stored offsets -will be committed according to 'auto.commit.interval.ms' or manual -offset-less `commit`. -Note that 'enable.auto.offset.store' must be set to False when using this API. +Write data to S3. **Arguments**: -- `message` (`confluent_kafka.Message`): Store message's offset+1. -- `offsets` (`List[TopicPartition]`): List of topic+partitions+offsets to store. +- `data`: The serialized data to write. +- `batch`: The batch information containing topic and partition details. -**Raises**: + -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer +## quixstreams.sinks.community.file.destinations - + -#### BaseConsumer.commit +## quixstreams.sinks.community.file.sink + + + +### FileSink ```python -def commit(message: Optional[Message] = None, - offsets: Optional[List[TopicPartition]] = None, - asynchronous: bool = True) -> Optional[List[TopicPartition]] +class FileSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L275) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/sink.py#L11) -Commit a message or a list of offsets. +A sink that writes data batches to files using configurable formats and +destinations. -The `message` and `offsets` parameters are mutually exclusive. -If neither is set, the current partition assignment's offsets are used instead. -Use this method to commit offsets if you have 'enable.auto.commit' set to False. +The sink groups messages by their topic and partition, ensuring data from the +same source is stored together. Each batch is serialized using the specified +format (e.g., JSON, Parquet) before being written to the configured +destination. -**Arguments**: - -- `message` (`Message`): Commit the message's offset+1. -Note: By convention, committed offsets reflect the next message -to be consumed, **not** the last message consumed. -- `offsets` (`List[TopicPartition]`): List of topic+partitions+offsets to commit. -- `asynchronous` (`bool`): If true, asynchronously commit, returning None -immediately. If False, the commit() call will block until the commit -succeeds or fails and the committed offsets will be returned (on success). -Note that specific partitions may have failed and the .err field of -each partition should be checked for success. - -**Raises**: - -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer +The destination determines the storage location and write behavior. By default, +it uses LocalDestination for writing to the local filesystem, but can be +configured to use other storage backends (e.g., cloud storage). 
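A usage sketch for the sink (not from the generated reference; the broker address, topic, and directory are placeholders), following the same pattern as the `IcebergSink` example further down this page:

```python
from quixstreams import Application
from quixstreams.sinks.community.file import FileSink
from quixstreams.sinks.community.file.destinations.local import LocalDestination

app = Application(broker_address="localhost:9092", consumer_group="file-sink")
topic = app.topic("sink-topic")

# Write each batch as JSON Lines under ./output/, appending to existing files
file_sink = FileSink(
    directory="output",
    format="json",
    destination=LocalDestination(append=True),
)

sdf = app.dataframe(topic=topic)
sdf.sink(file_sink)

if __name__ == "__main__":
    app.run()
```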
- + -#### BaseConsumer.committed +#### FileSink.\_\_init\_\_ ```python -def committed(partitions: List[TopicPartition], - timeout: Optional[float] = None) -> List[TopicPartition] +def __init__(directory: str = "", + format: Union[FormatName, Format] = "json", + destination: Optional[Destination] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L316) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/sink.py#L25) -Retrieve committed offsets for the specified partitions. +Initialize the FileSink with the specified configuration. **Arguments**: -- `partitions` (`List[TopicPartition]`): List of topic+partitions to query for stored offsets. -- `timeout` (`float`): Request timeout (seconds). -None or -1 is infinite. Default: None - -**Raises**: - -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer - -**Returns**: - -`List[TopicPartition]`: List of topic+partitions with offset and possibly error set. +- `directory`: Base directory path for storing files. Defaults to +current directory. +- `format`: Data serialization format, either as a string +("json", "parquet") or a Format instance. +- `destination`: Storage destination handler. Defaults to +LocalDestination if not specified. - + -#### BaseConsumer.get\_watermark\_offsets +#### FileSink.write ```python -def get_watermark_offsets(partition: TopicPartition, - timeout: Optional[float] = None, - cached: bool = False) -> Tuple[int, int] +def write(batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L334) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/sink.py#L46) -Retrieve low and high offsets for the specified partition. +Write a batch of data using the configured format and destination. + +The method performs the following steps: +1. Serializes the batch data using the configured format +2. Writes the serialized data to the destination +3. Handles any write failures by raising a backpressure error **Arguments**: -- `partition` (`TopicPartition`): Topic+partition to return offsets for. -- `timeout` (`float`): Request timeout (seconds). None or -1 is infinite. -Ignored if cached=True. Default: None -- `cached` (`bool`): Instead of querying the broker, use cached information. -Cached values: The low offset is updated periodically -(if statistics.interval.ms is set) while the high offset is updated on each -message fetched from the broker for this partition. +- `batch`: The batch of data to write. **Raises**: -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer +- `SinkBackpressureError`: If the write operation fails, indicating +that the sink needs backpressure with a 5-second retry delay. -**Returns**: + -`Tuple[int, int]`: Tuple of (low,high) on success or None on timeout. -The high offset is the offset of the last message + 1. 
+## quixstreams.sinks.community.file - + -#### BaseConsumer.list\_topics +## quixstreams.sinks.community.iceberg + + + +### AWSIcebergConfig ```python -def list_topics(topic: Optional[str] = None, - timeout: Optional[float] = None) -> ClusterMetadata +class AWSIcebergConfig(BaseIcebergConfig) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L360) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L42) -Request metadata from the cluster. + -This method provides the same information as -listTopics(), describeTopics() and describeCluster() in the Java Admin client. +#### AWSIcebergConfig.\_\_init\_\_ -**Arguments**: +```python +def __init__(aws_s3_uri: str, + aws_region: Optional[str] = None, + aws_access_key_id: Optional[str] = None, + aws_secret_access_key: Optional[str] = None, + aws_session_token: Optional[str] = None) +``` -- `topic` (`str`): If specified, only request information about this topic, -else return results for all topics in cluster. -Warning: If auto.create.topics.enable is set to true on the broker and -an unknown topic is specified, it will be created. -- `timeout` (`float`): The maximum response time before timing out -None or -1 is infinite. Default: None +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L43) -**Raises**: +Configure IcebergSink to work with AWS Glue. -- `KafkaException`: if a Kafka-based error occurs +**Arguments**: - +- `aws_s3_uri`: The S3 URI where the table data will be stored +(e.g., 's3://your-bucket/warehouse/'). +- `aws_region`: The AWS region for the S3 bucket and Glue catalog. +- `aws_access_key_id`: the AWS access key ID. +NOTE: can alternatively set the AWS_ACCESS_KEY_ID environment variable +when using AWS Glue. +- `aws_secret_access_key`: the AWS secret access key. +NOTE: can alternatively set the AWS_SECRET_ACCESS_KEY environment variable +when using AWS Glue. +- `aws_session_token`: a session token (or will be generated for you). +NOTE: can alternatively set the AWS_SESSION_TOKEN environment variable when +using AWS Glue. -#### BaseConsumer.memberid + + +### IcebergSink ```python -def memberid() -> Optional[str] +class IcebergSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L381) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L76) -Return this client's broker-assigned group member id. +IcebergSink writes batches of data to an Apache Iceberg table. -The member id is assigned by the group coordinator and is propagated to -the consumer during rebalance. +The data will by default include the kafka message key, value, and timestamp. -**Raises**: +It serializes incoming data batches into Parquet format and appends them to the +Iceberg table, updating the table schema as necessary. -- `RuntimeError`: if called on a closed consumer +Currently, supports Apache Iceberg hosted in: -**Returns**: +- AWS -`Optional[string]`: Member id string or None +Supported data catalogs: - +- AWS Glue -#### BaseConsumer.offsets\_for\_times +**Arguments**: -```python -def offsets_for_times(partitions: List[TopicPartition], - timeout: Optional[float] = None) -> List[TopicPartition] -``` +- `table_name`: The name of the Iceberg table. +- `config`: An IcebergConfig with all the various connection parameters. 
+- `data_catalog_spec`: data cataloger to use (ex. for AWS Glue, "aws_glue"). +- `schema`: The Iceberg table schema. If None, a default schema is used. +- `partition_spec`: The partition specification for the table. +If None, a default is used. -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L394) +Example setup using an AWS-hosted Iceberg with AWS Glue: -Look up offsets by timestamp for the specified partitions. +``` +from quixstreams import Application +from quixstreams.sinks.community.iceberg import IcebergSink, AWSIcebergConfig -The returned offset for each partition is the earliest offset whose -timestamp is greater than or equal to the given timestamp in the -corresponding partition. If the provided timestamp exceeds that of the -last message in the partition, a value of -1 will be returned. +# Configure S3 bucket credentials +iceberg_config = AWSIcebergConfig( + aws_s3_uri="", aws_region="", aws_access_key_id="", aws_secret_access_key="" +) -**Arguments**: +# Configure the sink to write data to S3 with the AWS Glue catalog spec +iceberg_sink = IcebergSink( + table_name="glue.sink-test", + config=iceberg_config, + data_catalog_spec="aws_glue", +) -- `partitions` (`List[TopicPartition]`): topic+partitions with timestamps -in the TopicPartition.offset field. -- `timeout` (`float`): The maximum response time before timing out. -None or -1 is infinite. Default: None +app = Application(broker_address='localhost:9092', auto_offset_reset="earliest") +topic = app.topic('sink_topic') -**Raises**: +# Do some processing here +sdf = app.dataframe(topic=topic).print(metadata=True) -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer +# Sink results to the IcebergSink +sdf.sink(iceberg_sink) -**Returns**: -`List[TopicPartition]`: List of topic+partition with offset field set and possibly error set +if __name__ == "__main__": + # Start the application + app.run() +``` - + -#### BaseConsumer.pause +#### IcebergSink.write ```python -def pause(partitions: List[TopicPartition]) +def write(batch: SinkBatch) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L420) - -Pause consumption for the provided list of partitions. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L174) -Paused partitions must be tracked manually. +Writes a batch of data to the Iceberg table. -Does NOT affect the result of `Consumer.assignment()`. +Implements retry logic to handle concurrent write conflicts. **Arguments**: -- `partitions` (`List[TopicPartition]`): List of topic+partitions to pause. +- `batch`: The batch of data to write. -**Raises**: + -- `KafkaException`: if a Kafka-based error occurs +## quixstreams.sinks.community.redis - + -#### BaseConsumer.resume +### RedisSink ```python -def resume(partitions: List[TopicPartition]) +class RedisSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L433) - -Resume consumption for the provided list of partitions. - -**Arguments**: - -- `partitions` (`List[TopicPartition]`): List of topic+partitions to resume. 
- -**Raises**: - -- `KafkaException`: if a Kafka-based error occurs +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/redis.py#L21) - + -#### BaseConsumer.position +#### RedisSink.\_\_init\_\_ ```python -def position(partitions: List[TopicPartition]) -> List[TopicPartition] +def __init__(host: str, + port: int, + db: int, + value_serializer: Callable[[Any], Union[bytes, str]] = json.dumps, + key_serializer: Optional[Callable[[Any, Any], Union[bytes, + str]]] = None, + password: Optional[str] = None, + socket_timeout: float = 30.0, + **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L443) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/redis.py#L22) -Retrieve current positions (offsets) for the specified partitions. +A connector to sink processed data to Redis. -**Arguments**: +It batches the processed records in memory per topic partition, and flushes them to Redis at the checkpoint. -- `partitions` (`List[TopicPartition]`): List of topic+partitions to return -current offsets for. The current offset is the offset of -the last consumed message + 1. +**Arguments**: -**Raises**: +- `host`: Redis host. +- `port`: Redis port. +- `db`: Redis DB number. +- `value_serializer`: a callable to serialize the value to string or bytes +(defaults to json.dumps). +- `key_serializer`: an optional callable to serialize the key to string or bytes. +If not provided, the Kafka message key will be used as is. +- `password`: Redis password, optional. +- `socket_timeout`: Redis socket timeout. +Default - 30s. +- `kwargs`: Additional keyword arguments passed to the `redis.Redis` instance. -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer + -**Returns**: +## quixstreams.sinks.community.bigquery -`List[TopicPartition]`: List of topic+partitions with offset and possibly error set. + - +### BigQuerySink -#### BaseConsumer.seek +```python +class BigQuerySink(BatchingSink) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/bigquery.py#L53) + + + +#### BigQuerySink.\_\_init\_\_ ```python -def seek(partition: TopicPartition) +def __init__(project_id: str, + location: str, + dataset_id: str, + table_name: str, + service_account_json: Optional[str] = None, + schema_auto_update: bool = True, + ddl_timeout: float = 10.0, + insert_timeout: float = 10.0, + retry_timeout: float = 30.0, + **kwargs) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L457) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/bigquery.py#L54) -Set consume position for partition to offset. +A connector to sink processed data to Google Cloud BigQuery. -The offset may be an absolute (>=0) or a -logical offset like `OFFSET_BEGINNING`. +It batches the processed records in memory per topic partition, and flushes them to BigQuery at the checkpoint. -`seek()` may only be used to update the consume offset of an -actively consumed partition (i.e., after `Consumer.assign()`), -to set the starting offset of partition not being consumed instead -pass the offset in an `assign()` call. +>***NOTE***: BigQuerySink can accept only dictionaries. +> If the record values are not dicts, you need to convert them to dicts before +> sinking. 
+ +The column names and types are inferred from individual records. +Each key in the record's dictionary will be inserted as a column to the resulting BigQuery table. + +If the column is not present in the schema, the sink will try to add new nullable columns on the fly with types inferred from individual values. +The existing columns will not be affected. +To disable this behavior, pass `schema_auto_update=False` and define the necessary schema upfront. +The minimal schema must define two columns: "timestamp" of type TIMESTAMP, and "__key" with a type of the expected message key. **Arguments**: -- `partition` (`TopicPartition`): Topic+partition+offset to seek to. +- `project_id`: a Google project id. +- `location`: a BigQuery location. +- `dataset_id`: a BigQuery dataset id. +If the dataset does not exist, the sink will try to create it. +- `table_name`: BigQuery table name. +If the table does not exist, the sink will try to create it with a default schema. +- `service_account_json`: an optional JSON string with service account credentials +to connect to BigQuery. +The internal `google.cloud.bigquery.Client` will use the Application Default Credentials if not provided. +See https://cloud.google.com/docs/authentication/provide-credentials-adc for more info. +Default - `None`. +- `schema_auto_update`: if True, the sink will try to create a dataset and a table if they don't exist. +It will also add missing columns on the fly with types inferred from individual values. +- `ddl_timeout`: a timeout for a single DDL operation (adding tables, columns, etc.). +Default - 10s. +- `insert_timeout`: a timeout for a single INSERT operation. +Default - 10s. +- `retry_timeout`: a total timeout for each request to BigQuery API. +During this timeout, a request can be retried according +to the client's default retrying policy. +- `kwargs`: Additional keyword arguments passed to `bigquery.Client`. -**Raises**: + -- `KafkaException`: if a Kafka-based error occurs +## quixstreams.sinks.community.kinesis - + -#### BaseConsumer.assignment +### KinesisStreamNotFoundError ```python -def assignment() -> List[TopicPartition] +class KinesisStreamNotFoundError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L474) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L23) -Returns the current partition assignment. +Raised when the specified Kinesis stream does not exist. -**Raises**: + -- `KafkaException`: if a Kafka-based error occurs -- `RuntimeError`: if called on a closed consumer +### KinesisSink -**Returns**: +```python +class KinesisSink(BaseSink) +``` -`List[TopicPartition]`: List of assigned topic+partitions. 
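A minimal construction sketch for the `BigQuerySink` described above (project, dataset, and table names are placeholders; without `service_account_json` the client falls back to Application Default Credentials):

```python
from quixstreams.sinks.community.bigquery import BigQuerySink

bigquery_sink = BigQuerySink(
    project_id="my-project",
    location="EU",
    dataset_id="my_dataset",
    table_name="my_table",
)

# Attach it with sdf.sink(bigquery_sink); record values must already be dictionaries.
```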
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L27) - + -#### BaseConsumer.set\_sasl\_credentials +#### KinesisSink.\_\_init\_\_ ```python -def set_sasl_credentials(username: str, password: str) +def __init__(stream_name: str, + aws_access_key_id: Optional[str] = getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key: Optional[str] = getenv( + "AWS_SECRET_ACCESS_KEY"), + region_name: Optional[str] = getenv("AWS_REGION", + getenv("AWS_DEFAULT_REGION")), + value_serializer: Callable[[Any], str] = json.dumps, + key_serializer: Callable[[Any], str] = bytes.decode, + **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L487) - -Sets the SASL credentials used for this client. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L28) -These credentials will overwrite the old ones, and will be used the next -time the client needs to authenticate. -This method will not disconnect existing broker connections that have been -established with the old credentials. -This method is applicable only to SASL PLAIN and SCRAM mechanisms. +Initialize the KinesisSink. **Arguments**: -- `username` (`str`): your username -- `password` (`str`): your password +- `stream_name`: Kinesis stream name. +- `aws_access_key_id`: AWS access key ID. +- `aws_secret_access_key`: AWS secret access key. +- `region_name`: AWS region name (e.g., 'us-east-1'). +- `value_serializer`: Function to serialize the value to string +(defaults to json.dumps). +- `key_serializer`: Function to serialize the key to string +(defaults to bytes.decode). +- `kwargs`: Additional keyword arguments passed to boto3.client. - + -#### BaseConsumer.incremental\_assign +#### KinesisSink.add ```python -def incremental_assign(partitions: List[TopicPartition]) +def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, + topic: str, partition: int, offset: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L501) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L80) -Assign new partitions. - -Can be called outside the `Consumer` `on_assign` callback (multiple times). -Partitions immediately show on `Consumer.assignment()`. - -Any additional partitions besides the ones passed during the `Consumer` -`on_assign` callback will NOT be associated with the consumer group. - -**Arguments**: +Buffer a record for the Kinesis stream. -- `partitions` (`List[TopicPartition]`): a list of topic partitions +Records are buffered until the batch size reaches 500, at which point +they are sent immediately. If the batch size is less than 500, records +will be sent when the flush method is called. - + -#### BaseConsumer.incremental\_unassign +#### KinesisSink.flush ```python -def incremental_unassign(partitions: List[TopicPartition]) +def flush(topic: str, partition: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L515) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L110) -Revoke partitions. +Flush all buffered records for a given topic-partition. -Can be called outside an on_revoke callback. 
+This method sends any outstanding records that have not yet been sent +because the batch size was less than 500. It waits for all futures to +complete, ensuring that all records are successfully sent to the Kinesis +stream. -**Arguments**: + -- `partitions` (`List[TopicPartition]`): a list of topic partitions +## quixstreams.sinks.community.postgresql - + -#### BaseConsumer.close +### PostgreSQLSink ```python -def close() +class PostgreSQLSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L525) - -Close down and terminate the Kafka Consumer. - -Actions performed: - -- Stops consuming. -- Commits offsets, unless the consumer property 'enable.auto.commit' is set to False. -- Leaves the consumer group. - -Registered callbacks may be called from this method, -see `poll()` for more info. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/postgresql.py#L48) - + -#### BaseConsumer.consumer\_group\_metadata +#### PostgreSQLSink.\_\_init\_\_ ```python -def consumer_group_metadata() -> GroupMetadata +def __init__(host: str, + port: int, + dbname: str, + user: str, + password: str, + table_name: str, + schema_auto_update: bool = True, + **kwargs) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/kafka/consumer.py#L542) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/postgresql.py#L49) -Used by the producer during consumer offset sending for an EOS transaction. +A connector to sink topic data to PostgreSQL. - +**Arguments**: -## quixstreams.kafka.exceptions +- `host`: PostgreSQL server address. +- `port`: PostgreSQL server port. +- `dbname`: PostgreSQL database name. +- `user`: Database user name. +- `password`: Database user password. +- `table_name`: PostgreSQL table name. +- `schema_auto_update`: Automatically update the schema when new columns are detected. +- `ddl_timeout`: Timeout for DDL operations such as table creation or schema updates. +- `kwargs`: Additional parameters for `psycopg2.connect`. - + -## quixstreams.app +## quixstreams.sinks.community.pubsub - + -### Application +### PubSubTopicNotFoundError ```python -class Application() +class PubSubTopicNotFoundError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L75) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L25) -The main Application class. +Raised when the specified topic does not exist. -Typically, the primary object needed to get a kafka application up and running. + -Most functionality is explained the various methods, except for -"column assignment". +### PubSubSink +```python +class PubSubSink(BaseSink) +``` -What it Does: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L29) -- On init: - - Provides defaults or helper methods for commonly needed objects - - If `quix_sdk_token` is passed, configures the app to use the Quix Cloud. -- When executed via `.run()` (after setup): - - Initializes Topics and StreamingDataFrames - - Facilitates processing of Kafka messages with a `StreamingDataFrame` - - Handles all Kafka client consumer/producer responsibilities. +A sink that publishes messages to Google Cloud Pub/Sub. 
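A minimal construction sketch (the GCP project and topic IDs are placeholders; like the other community sinks, it is attached with `StreamingDataFrame.sink`):

```python
from quixstreams.sinks.community.pubsub import PubSubSink

pubsub_sink = PubSubSink(
    project_id="my-gcp-project",
    topic_id="my-pubsub-topic",
)

# Attach it to a StreamingDataFrame: sdf.sink(pubsub_sink)
```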
+ -Example Snippet: +#### PubSubSink.\_\_init\_\_ ```python -from quixstreams import Application - -# Set up an `app = Application` and `sdf = StreamingDataFrame`; -# add some operations to `sdf` and then run everything. - -app = Application(broker_address='localhost:9092', consumer_group='group') -topic = app.topic('test-topic') -df = app.dataframe(topic) -df.apply(lambda value, context: print('New message', value)) +def __init__(project_id: str, + topic_id: str, + service_account_json: Optional[str] = None, + value_serializer: Callable[[Any], Union[bytes, str]] = json.dumps, + key_serializer: Callable[[Any], str] = bytes.decode, + flush_timeout: int = 5, + **kwargs) -> None +``` -app.run() +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L32) + +Initialize the PubSubSink. + +**Arguments**: + +- `project_id`: GCP project ID. +- `topic_id`: Pub/Sub topic ID. +- `service_account_json`: an optional JSON string with service account credentials +to connect to Pub/Sub. +The internal `PublisherClient` will use the Application Default Credentials if not provided. +See https://cloud.google.com/docs/authentication/provide-credentials-adc for more info. +Default - `None`. +- `value_serializer`: Function to serialize the value to string or bytes +(defaults to json.dumps). +- `key_serializer`: Function to serialize the key to string +(defaults to bytes.decode). +- `kwargs`: Additional keyword arguments passed to PublisherClient. + + + +#### PubSubSink.add + +```python +def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, + topic: str, partition: int, offset: int) -> None ``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L81) -#### Application.\_\_init\_\_ +Publish a message to Pub/Sub. + + + +#### PubSubSink.flush ```python -def __init__(broker_address: Optional[Union[str, ConnectionConfig]] = None, - *, - quix_sdk_token: Optional[str] = None, - consumer_group: Optional[str] = None, - auto_offset_reset: AutoOffsetReset = "latest", - commit_interval: float = 5.0, - commit_every: int = 0, - consumer_extra_config: Optional[dict] = None, - producer_extra_config: Optional[dict] = None, - state_dir: Union[str, Path] = Path("state"), - rocksdb_options: Optional[RocksDBOptionsType] = None, - on_consumer_error: Optional[ConsumerErrorCallback] = None, - on_processing_error: Optional[ProcessingErrorCallback] = None, - on_producer_error: Optional[ProducerErrorCallback] = None, - on_message_processed: Optional[MessageProcessedCallback] = None, - consumer_poll_timeout: float = 1.0, - producer_poll_timeout: float = 0.0, - loglevel: Optional[Union[int, LogLevel]] = "INFO", - auto_create_topics: bool = True, - use_changelog_topics: bool = True, - quix_config_builder: Optional[QuixKafkaConfigsBuilder] = None, - topic_manager: Optional[TopicManager] = None, - request_timeout: float = 30, - topic_create_timeout: float = 60, - processing_guarantee: ProcessingGuarantee = "at-least-once") +def flush(topic: str, partition: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L114) -**Arguments**: +Wait for all publish operations to complete successfully. -- `broker_address`: Connection settings for Kafka. -Used by Producer, Consumer, and Admin clients. 
-Accepts string with Kafka broker host and port formatted as `:`, -or a ConnectionConfig object if authentication is required. -Either this OR `quix_sdk_token` must be set to use `Application` (not both). -Takes priority over quix auto-configuration. -Linked Environment Variable: `Quix__Broker__Address`. -Default: `None` -- `quix_sdk_token`: If using the Quix Cloud, the SDK token to connect with. -Either this OR `broker_address` must be set to use Application (not both). -Linked Environment Variable: `Quix__Sdk__Token`. -Default: None (if not run on Quix Cloud) - >***NOTE:*** the environment variable is set for you in the Quix Cloud -- `consumer_group`: Kafka consumer group. -Passed as `group.id` to `confluent_kafka.Consumer`. -Linked Environment Variable: `Quix__Consumer__Group`. -Default - "quixstreams-default" (set during init) - >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. -- `commit_interval`: How often to commit the processed messages in seconds. -Default - 5.0. -- `commit_every`: Commit the checkpoint after processing N messages. -Use this parameter for more granular control of the commit schedule. -If the value is > 0, the application will commit the checkpoint after -processing the specified number of messages across all the assigned -partitions. -If the value is <= 0, only the `commit_interval` will be considered. -Default - 0. - >***NOTE:*** Only input offsets are counted, and the application - > may produce more results than the number of incoming messages. -- `auto_offset_reset`: Consumer `auto.offset.reset` setting -- `consumer_extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Consumer` as is. -- `producer_extra_config`: A dictionary with additional options that -will be passed to `confluent_kafka.Producer` as is. -- `state_dir`: path to the application state directory. -Default - `"state"`. -- `rocksdb_options`: RocksDB options. -If `None`, the default options will be used. -- `consumer_poll_timeout`: timeout for `RowConsumer.poll()`. Default - `1.0`s -- `producer_poll_timeout`: timeout for `RowProducer.poll()`. Default - `0`s. -- `on_message_processed`: a callback triggered when message is successfully -processed. -- `loglevel`: a log level for "quixstreams" logger. -Should be a string or None. -If `None` is passed, no logging will be configured. -You may pass `None` and configure "quixstreams" logger -externally using `logging` library. -Default - `"INFO"`. -- `auto_create_topics`: Create all `Topic`s made via Application.topic() -Default - `True` -- `use_changelog_topics`: Use changelog topics to back stateful operations -Default - `True` -- `topic_manager`: A `TopicManager` instance -- `request_timeout`: timeout (seconds) for REST-based requests -- `topic_create_timeout`: timeout (seconds) for topic create finalization -- `processing_guarantee`: Use "exactly-once" or "at-least-once" processing. -

***Error Handlers***
-To handle errors, `Application` accepts callbacks triggered when - exceptions occur on different stages of stream processing. If the callback - returns `True`, the exception will be ignored. Otherwise, the exception - will be propagated and the processing will eventually stop. -- `on_consumer_error`: triggered when internal `RowConsumer` fails -to poll Kafka or cannot deserialize a message. -- `on_processing_error`: triggered when exception is raised within -`StreamingDataFrame.process()`. -- `on_producer_error`: triggered when `RowProducer` fails to serialize -or to produce a message to Kafka. -

***Quix Cloud Parameters***
-- `quix_config_builder`: instance of `QuixKafkaConfigsBuilder` to be used -instead of the default one. -> NOTE: It is recommended to just use `quix_sdk_token` instead. + - +## quixstreams.sinks + + + +## quixstreams.sources.base.multiprocessing + + + +## quixstreams.sources.base + + + +## quixstreams.sources.base.exceptions + + + +### SourceException + +```python +class SourceException(Exception) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/exceptions.py#L9) + +Raised in the parent process when a source finish with an exception + + + +## quixstreams.sources.base.manager + + + +### SourceProcess + +```python +class SourceProcess(process) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L30) + +An implementation of the Source subprocess. + +It manages a source and its subprocess, handles the communication between the child and parent processes, +lifecycle, and error handling. + +Some methods are designed to be used from the parent process, and others from the child process. + + + +#### SourceProcess.run + +```python +def run() -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L80) + +An entrypoint of the child process. + +Responsible for: + * Configuring the signal handlers to handle shutdown properly + * Execution of the source `run` method + * Reporting the source exceptions to the parent process + + + +#### SourceProcess.raise\_for\_error + +```python +def raise_for_error() -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L196) + +Raise a `quixstreams.sources.manager.SourceException` +if the child process was terminated with an exception. + + + +#### SourceProcess.stop + +```python +def stop() +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L220) + +Handle shutdown of the source and its subprocess. + +First, it tries to shut down gracefully by sending a SIGTERM and waiting up to +`source.shutdown_timeout` seconds for the process to exit. If the process +is still alive, it will kill it with a SIGKILL. + + + +### SourceManager + +```python +class SourceManager() +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L243) + +Class managing the sources registered with the app + +Sources run in their separate process pay attention about cross-process communication + + + +#### SourceManager.register + +```python +def register(source: BaseSource, topic, producer, consumer, + topic_manager) -> SourceProcess +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L253) + +Register a new source in the manager. 
+ +Each source need to already be configured, can't reuse a topic and must be unique + + + +#### SourceManager.raise\_for\_error + +```python +def raise_for_error() -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L304) + +Raise an exception if any process has stopped with an exception + + + +#### SourceManager.is\_alive + +```python +def is_alive() -> bool +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/manager.py#L311) + +Check if any process is alive + +**Returns**: + +True if at least one process is alive + + + +## quixstreams.sources.base.source + + + +### BaseSource + +```python +class BaseSource(ABC) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L17) + +This is the base class for all sources. + +Sources are executed in a sub-process of the main application. + +To create your own source you need to implement: + +* `start` +* `stop` +* `default_topic` + +`BaseSource` is the most basic interface, and the framework expects every +source to implement it. +Use `Source` to benefit from a base implementation. + +You can connect a source to a StreamingDataframe using the Application. + +Example snippet: + +```python +class RandomNumbersSource(BaseSource): +def __init__(self): + super().__init__() + self._running = False + +def start(self): + self._running = True + + while self._running: + number = random.randint(0, 100) + serialized = self._producer_topic.serialize(value=number) + self._producer.produce( + topic=self._producer_topic.name, + key=serialized.key, + value=serialized.value, + ) + +def stop(self): + self._running = False + +def default_topic(self) -> Topic: + return Topic( + name="topic-name", + value_deserializer="json", + value_serializer="json", + ) + + +def main(): + app = Application(broker_address="localhost:9092") + source = RandomNumbersSource() + + sdf = app.dataframe(source=source) + sdf.print(metadata=True) + + app.run() + + +if __name__ == "__main__": + main() +``` + + + +#### BaseSource.configure + +```python +def configure(topic: Topic, producer: RowProducer, **kwargs) -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L88) + +This method is triggered before the source is started. + +It configures the source's Kafka producer, the topic it will produce to and optional dependencies. + + + +#### BaseSource.start + +```python +@abstractmethod +def start() -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L110) + +This method is triggered in the subprocess when the source is started. + +The subprocess will run as long as the start method executes. +Use it to fetch data and produce it to Kafka. + + + +#### BaseSource.stop + +```python +@abstractmethod +def stop() -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L119) + +This method is triggered when the application is shutting down. + +The source must ensure that the `run` method is completed soon. 
+ + + +#### BaseSource.default\_topic + +```python +@abstractmethod +def default_topic() -> Topic +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L127) + +This method is triggered when the topic is not provided to the source. + +The source must return a default topic configuration. + +Note: if the default topic is used, the Application will prefix its name with "source__". + + + +### Source + +```python +class Source(BaseSource) +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L137) + +A base class for custom Sources that provides a basic implementation of `BaseSource` +interface. +It is recommended to interface to create custom sources. + +Subclass it and implement the `run` method to fetch data and produce it to Kafka. -#### Application.Quix +**Example**: + ```python -@classmethod -def Quix(cls, *args, **kwargs) -``` +import random +import time -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L352) +from quixstreams import Application +from quixstreams.sources import Source -RAISES EXCEPTION: DEPRECATED. -use Application() with "quix_sdk_token" parameter or set the "Quix__Sdk__Token" -environment variable. +class RandomNumbersSource(Source): + def run(self): + while self.running: + number = random.randint(0, 100) + serialized = self._producer_topic.serialize(value=number) + self.produce(key=str(number), value=serialized.value) + time.sleep(0.5) - -#### Application.topic +def main(): + app = Application(broker_address="localhost:9092") + source = RandomNumbersSource(name="random-source") + + sdf = app.dataframe(source=source) + sdf.print(metadata=True) + + app.run() + + +if __name__ == "__main__": + main() +``` + + + Helper methods and properties: + + * `serialize()` + * `produce()` + * `flush()` + * `running` + + + +#### Source.\_\_init\_\_ ```python -def topic(name: str, - value_deserializer: DeserializerType = "json", - key_deserializer: DeserializerType = "bytes", - value_serializer: SerializerType = "json", - key_serializer: SerializerType = "bytes", - config: Optional[TopicConfig] = None, - timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic +def __init__(name: str, shutdown_timeout: float = 10) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L384) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L187) -Create a topic definition. +**Arguments**: -Allows you to specify serialization that should be used when consuming/producing -to the topic in the form of a string name (i.e. "json" for JSON) or a -serialization class instance directly, like JSONSerializer(). +- `name`: The source unique name. It is used to generate the topic configuration. +- `shutdown_timeout`: Time in second the application waits for the source to gracefully shutdown. + -Example Snippet: +#### Source.running ```python -from quixstreams import Application +@property +def running() -> bool +``` -# Specify an input and output topic for a `StreamingDataFrame` instance, -# where the output topic requires adjusting the key serializer. 
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L201) -app = Application() -input_topic = app.topic("input-topic", value_deserializer="json") -output_topic = app.topic( - "output-topic", key_serializer="str", value_serializer=JSONSerializer() -) -sdf = app.dataframe(input_topic) -sdf.to_topic(output_topic) -``` +Property indicating if the source is running. -**Arguments**: +The `stop` method will set it to `False`. Use it to stop the source gracefully. -- `name`: topic name ->***NOTE:*** If the application is created via `Quix.Application()`, -the topic name will be prefixed by Quix workspace id, and it will -be `-` -- `value_deserializer`: a deserializer type for values; default="json" -- `key_deserializer`: a deserializer type for keys; default="bytes" -- `value_serializer`: a serializer type for values; default="json" -- `key_serializer`: a serializer type for keys; default="bytes" -- `config`: optional topic configurations (for creation/validation) ->***NOTE:*** will not create without Application's auto_create_topics set -to True (is True by default) -- `timestamp_extractor`: a callable that returns a timestamp in -milliseconds from a deserialized message. Default - `None`. + -Example Snippet: +#### Source.cleanup ```python -app = Application(...) +def cleanup(failed: bool) -> None +``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L209) -def custom_ts_extractor( - value: Any, - headers: Optional[List[Tuple[str, bytes]]], - timestamp: float, - timestamp_type: TimestampType, -) -> int: - return value["timestamp"] +This method is triggered once the `run` method completes. -topic = app.topic("input-topic", timestamp_extractor=custom_ts_extractor) +Use it to clean up the resources and shut down the source gracefully. + +It flushes the producer when `_run` completes successfully. + + + +#### Source.stop + +```python +def stop() -> None ``` -**Returns**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L220) -`Topic` object +This method is triggered when the application is shutting down. - +It sets the `running` property to `False`. -#### Application.dataframe + + +#### Source.start ```python -def dataframe(topic: Optional[Topic] = None, - source: Optional[BaseSource] = None) -> StreamingDataFrame +def start() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L464) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L228) -A simple helper method that generates a `StreamingDataFrame`, which is used +This method is triggered in the subprocess when the source is started. -to define your message processing pipeline. +It marks the source as running, execute it's run method and ensure cleanup happens. -The topic is what the `StreamingDataFrame` will use as its input, unless -a source is provided (`topic` is optional when using a `source`). + -If both `topic` AND `source` are provided, the source will write to that topic -instead of its default topic (which the `StreamingDataFrame` then consumes). +#### Source.run -See :class:`quixstreams.dataframe.StreamingDataFrame` for more details. 
+```python +@abstractmethod +def run() +``` -Example Snippet: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L244) -```python -from quixstreams import Application +This method is triggered in the subprocess when the source is started. -# Set up an `app = Application` and `sdf = StreamingDataFrame`; -# add some operations to `sdf` and then run everything. +The subprocess will run as long as the run method executes. +Use it to fetch data and produce it to Kafka. -app = Application(broker_address='localhost:9092', consumer_group='group') -topic = app.topic('test-topic') -df = app.dataframe(topic) -df.apply(lambda value, context: print('New message', value) + -app.run() +#### Source.serialize + +```python +def serialize(key: Optional[object] = None, + value: Optional[object] = None, + headers: Optional[Headers] = None, + timestamp_ms: Optional[int] = None) -> KafkaMessage ``` -**Arguments**: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L252) -- `topic`: a `quixstreams.models.Topic` instance -to be used as an input topic. -- `source`: a `quixstreams.sources` "BaseSource" instance +Serialize data to bytes using the producer topic serializers and return a `quixstreams.models.messages.KafkaMessage`. **Returns**: -`StreamingDataFrame` object +`quixstreams.models.messages.KafkaMessage` - + -#### Application.stop +#### Source.produce ```python -def stop(fail: bool = False) +def produce(value: Optional[Union[str, bytes]] = None, + key: Optional[Union[str, bytes]] = None, + headers: Optional[Headers] = None, + partition: Optional[int] = None, + timestamp: Optional[int] = None, + poll_timeout: float = 5.0, + buffer_error_max_tries: int = 3) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L520) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L268) -Stop the internal poll loop and the message processing. +Produce a message to the configured source topic in Kafka. -Only necessary when manually managing the lifecycle of the `Application` ( -likely through some sort of threading). + -To otherwise stop an application, either send a `SIGTERM` to the process -(like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`). +#### Source.flush + +```python +def flush(timeout: Optional[float] = None) -> None +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L293) + +This method flush the producer. + +It ensures all messages are successfully delivered to Kafka. **Arguments**: -- `fail`: if True, signals that application is stopped due -to unhandled exception, and it shouldn't commit the current checkpoint. +- `timeout` (`float`): time to attempt flushing (seconds). +None use producer default or -1 is infinite. Default: None - +**Raises**: -#### Application.get\_producer +- `CheckpointProducerTimeout`: if any message fails to produce before the timeout + + + +#### Source.default\_topic ```python -def get_producer() -> Producer +def default_topic() -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L565) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L311) -Create and return a pre-configured Producer instance. 
-The Producer is initialized with params passed to Application. +Return a default topic matching the source name. -It's useful for producing data to Kafka outside the standard Application processing flow, -(e.g. to produce test data into a topic). -Using this within the StreamingDataFrame functions is not recommended, as it creates a new Producer -instance each time, which is not optimized for repeated use in a streaming pipeline. +The default topic will not be used if the topic has already been provided to the source. -Example Snippet: +Note: if the default topic is used, the Application will prefix its name with "source__". -```python -from quixstreams import Application +**Returns**: -app = Application(...) -topic = app.topic("input") +`quixstreams.models.topics.Topic` -with app.get_producer() as producer: - for i in range(100): - producer.produce(topic=topic.name, key=b"key", value=b"value") + + +### StatefulSource + +```python +class StatefulSource(Source) ``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L330) -#### Application.get\_consumer +A `Source` class for custom Sources that need a state. + +Subclasses are responsible for flushing, by calling `flush`, at reasonable intervals. + +**Example**: + ```python -def get_consumer(auto_commit_enable: bool = True) -> Consumer -``` +import random +import time -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L613) +from quixstreams import Application +from quixstreams.sources import StatefulSource -Create and return a pre-configured Consumer instance. -The Consumer is initialized with params passed to Application. +class RandomNumbersSource(StatefulSource): + def run(self): -It's useful for consuming data from Kafka outside the standard -Application processing flow. -(e.g., to consume test data from a topic). -Using it within the StreamingDataFrame functions is not recommended, as it -creates a new Consumer instance -each time, which is not optimized for repeated use in a streaming pipeline. + i = 0 + while self.running: + previous = self.state.get("number", 0) + current = random.randint(0, 100) + self.state.set("number", current) -Note: By default, this consumer does not autocommit the consumed offsets to allow -at-least-once processing. -To store the offset call store_offsets() after processing a message. -If autocommit is necessary set `enable.auto.offset.store` to True in -the consumer config when creating the app. + serialized = self._producer_topic.serialize(value=current + previous) + self.produce(key=str(current), value=serialized.value) + time.sleep(0.5) -Example Snippet: + # flush the state every 10 messages + i += 1 + if i % 10 == 0: + self.flush() -```python -from quixstreams import Application -app = Application(...) 
-topic = app.topic("input") +def main(): + app = Application(broker_address="localhost:9092") + source = RandomNumbersSource(name="random-source") -with app.get_consumer() as consumer: - consumer.subscribe([topic.name]) - while True: - msg = consumer.poll(timeout=1.0) - if msg is not None: - # Process message - # Optionally commit the offset - # consumer.store_offsets(msg) + sdf = app.dataframe(source=source) + sdf.print(metadata=True) -``` + app.run() -**Arguments**: -- `auto_commit_enable`: Enable or disable auto commit -Default - True +if __name__ == "__main__": + main() +``` - + -#### Application.clear\_state +#### StatefulSource.\_\_init\_\_ ```python -def clear_state() +def __init__(name: str, shutdown_timeout: float = 10) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L663) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L380) -Clear the state of the application. +**Arguments**: - +- `name`: The source unique name. It is used to generate the topic configuration. +- `shutdown_timeout`: Time in second the application waits for the source to gracefully shutdown. -#### Application.add\_source + + +#### StatefulSource.configure ```python -def add_source(source: BaseSource, topic: Optional[Topic] = None) -> Topic +def configure(topic: Topic, + producer: RowProducer, + *, + store_partition: Optional[StorePartition] = None, + **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L669) - -Add a source to the application. - -Use when no transformations (which requires a `StreamingDataFrame`) are needed. - -See :class:`quixstreams.sources.base.BaseSource` for more details. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L390) -**Arguments**: +This method is triggered before the source is started. -- `source`: a :class:`quixstreams.sources.BaseSource` instance -- `topic`: the :class:`quixstreams.models.Topic` instance the source will produce to -Default - the topic generated by the `source.default_topic()` method. -Note: the names of default topics are prefixed with "source__". +It configures the source's Kafka producer, the topic it will produce to and the store partition. - + -#### Application.run +#### StatefulSource.store\_partitions\_count ```python -def run(dataframe: Optional[StreamingDataFrame] = None) +@property +def store_partitions_count() -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L700) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L409) -Start processing data from Kafka using provided `StreamingDataFrame` +Count of store partitions. -Once started, it can be safely terminated with a `SIGTERM` signal -(like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`). +Used to configure the number of partition in the changelog topic. + -Example Snippet: +#### StatefulSource.assigned\_store\_partition ```python -from quixstreams import Application - -# Set up an `app = Application` and `sdf = StreamingDataFrame`; -# add some operations to `sdf` and then run everything. 
+@property +def assigned_store_partition() -> int +``` -app = Application(broker_address='localhost:9092', consumer_group='group') -topic = app.topic('test-topic') -df = app.dataframe(topic) -df.apply(lambda value, context: print('New message', value) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L418) -app.run() -``` +The store partition assigned to this instance - + -#### Application.setup\_topics +#### StatefulSource.store\_name ```python -def setup_topics() +@property +def store_name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L823) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L425) -Validate and create the topics +The source store name - + -### ApplicationConfig +#### StatefulSource.state ```python -class ApplicationConfig(BaseSettings) +@property +def state() -> State ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L999) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L432) -Immutable object holding the application configuration +Access the `State` of the source. -For details see :class:`quixstreams.Application` +The `State` lifecycle is tied to the store transaction. A transaction is only valid until the next `.flush()` call. If no valid transaction exist, a new transaction is created. - +Important: after each `.flush()` call, a previously returned instance is invalidated and cannot be used. The property must be called again. -#### ApplicationConfig.settings\_customise\_sources + + +#### StatefulSource.flush ```python -@classmethod -def settings_customise_sources( - cls, settings_cls: Type[PydanticBaseSettings], - init_settings: PydanticBaseSettingsSource, - env_settings: PydanticBaseSettingsSource, - dotenv_settings: PydanticBaseSettingsSource, - file_secret_settings: PydanticBaseSettingsSource -) -> Tuple[PydanticBaseSettingsSource, ...] +def flush(timeout: Optional[float] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L1034) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L451) -Included to ignore reading/setting values from the environment +This method commit the state and flush the producer. - +It ensures the state is published to the changelog topic and all messages are successfully delivered to Kafka. -#### ApplicationConfig.copy +**Arguments**: -```python -def copy(**kwargs) -> Self -``` +- `timeout` (`float`): time to attempt flushing (seconds). +None use producer default or -1 is infinite. 
Default: None -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/app.py#L1047) +**Raises**: -Update the application config and return a copy +- `CheckpointProducerTimeout`: if any message fails to produce before the timeout - + -## quixstreams.sources.core +## quixstreams.sources.core.kafka @@ -10570,7 +10684,7 @@ Update the application config and return a copy class Checkpoint(BaseCheckpoint) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/checkpoint.py#L15) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/checkpoint.py#L15) Checkpoint implementation used by the KafkaReplicatorSource @@ -10582,7 +10696,7 @@ Checkpoint implementation used by the KafkaReplicatorSource def close() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/checkpoint.py#L41) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/checkpoint.py#L41) Perform cleanup (when the checkpoint is empty) instead of committing. @@ -10596,7 +10710,7 @@ Needed for exactly-once, as Kafka transactions are timeboxed. def commit() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/checkpoint.py#L50) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/checkpoint.py#L50) Commit the checkpoint. @@ -10604,10 +10718,6 @@ This method will: 1. Flush the producer to ensure everything is delivered. 2. Commit topic offsets. - - -## quixstreams.sources.core.kafka - ## quixstreams.sources.core.kafka.kafka @@ -10620,7 +10730,7 @@ This method will: class KafkaReplicatorSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/kafka.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/kafka.py#L25) Source implementation that replicates a topic from a Kafka broker to your application broker. @@ -10667,7 +10777,7 @@ def __init__( key_deserializer: DeserializerType = "bytes") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/kafka.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/kafka.py#L54) **Arguments**: @@ -10704,7 +10814,7 @@ Default - `json` class QuixEnvironmentSource(KafkaReplicatorSource) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/quix.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/quix.py#L19) Source implementation that replicates a topic from a Quix Cloud environment to your application broker. It can copy messages for development and testing without risking producing them back or affecting the consumer groups. 
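+
+A minimal usage sketch (hedged: the constructor arguments below, `name`, `app_config`,
+`topic`, `quix_workspace_id`, and `quix_sdk_token`, follow the
+`quixstreams.sources.kafka.KafkaReplicatorSource` reference above and are assumptions,
+not signatures confirmed by this section):
+
+```python
+from quixstreams import Application
+from quixstreams.sources.kafka import QuixEnvironmentSource
+
+app = Application(consumer_group="group")
+
+# Assumed arguments: the workspace id and SDK token identify the Quix Cloud
+# environment to copy messages from; `app_config` reuses this app's settings.
+source = QuixEnvironmentSource(
+    name="quix-env-source",
+    app_config=app.config,
+    topic="source-environment-topic",
+    quix_workspace_id="<workspace-id>",
+    quix_sdk_token="<sdk-token>",
+)
+
+sdf = app.dataframe(source=source)
+sdf.print(metadata=True)
+
+if __name__ == "__main__":
+    app.run()
+```
+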
@@ -10755,7 +10865,7 @@ def __init__( key_deserializer: DeserializerType = "bytes") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/quix.py#L50) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/quix.py#L50) **Arguments**: @@ -10766,6 +10876,10 @@ Default - `Quix__Portal__Api` environment variable or Quix cloud production URL For other parameters See `quixstreams.sources.kafka.KafkaReplicatorSource` + + +## quixstreams.sources.core + ## quixstreams.sources.core.csv @@ -10778,7 +10892,7 @@ For other parameters See `quixstreams.sources.kafka.KafkaReplicatorSource` class CSVSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/csv.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/csv.py#L13) @@ -10795,7 +10909,7 @@ def __init__(path: Union[str, Path], dialect: str = "excel") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/csv.py#L14) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/csv.py#L14) A base CSV source that reads data from a CSV file and produces rows @@ -10821,162 +10935,199 @@ Default - `0`. See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info. Default - `"excel"`. - + -## quixstreams.sources +## quixstreams.sources.community - +This module contains Sources developed and maintained by the members of Quix Streams community. -## quixstreams.sources.community.kinesis.kinesis + - +## quixstreams.sources.community.pubsub -### KinesisSource + + +## quixstreams.sources.community.pubsub.consumer + + + +### PubSubSubscriptionNotFound ```python -class KinesisSource(StatefulSource) +class PubSubSubscriptionNotFound(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/kinesis.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/consumer.py#L30) -NOTE: Requires `pip install quixstreams[kinesis]` to work. +Raised when an expected subscription does not exist -This source reads data from an Amazon Kinesis stream, dumping it to a -kafka topic using desired `StreamingDataFrame`-based transformations. + -Provides "at-least-once" guarantees. +### PubSubConsumer -The incoming message value will be in bytes, so transform in your SDF accordingly. +```python +class PubSubConsumer() +``` -Example Usage: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/consumer.py#L34) + + + +#### PubSubConsumer.poll\_and\_process ```python -from quixstreams import Application -from quixstreams.sources.community.kinesis import KinesisSource +def poll_and_process(timeout: Optional[float] = None) +``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/consumer.py#L105) -kinesis = KinesisSource( - stream_name="", - aws_access_key_id="", - aws_secret_access_key="", - aws_region="", - auto_offset_reset="earliest", # start from the beginning of the stream (vs end) -) +This uses the asynchronous puller to retrieve and handle a message with its +assigned callback. 
-app = Application( - broker_address="", - consumer_group="", -) +Committing is a separate step. -sdf = app.dataframe(source=kinesis).print(metadata=True) -# YOUR LOGIC HERE! + -if __name__ == "__main__": - app.run() +#### PubSubConsumer.poll\_and\_process\_batch + +```python +def poll_and_process_batch() +``` + +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/consumer.py#L122) + +Polls and processes until either the max_batch_size or batch_timeout is reached. + + + +#### PubSubConsumer.subscribe + +```python +def subscribe() ``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/consumer.py#L134) + +Asynchronous subscribers require subscribing (synchronous do not). + +NOTE: This will not detect whether the subscription exists. -#### KinesisSource.\_\_init\_\_ + + +#### PubSubConsumer.handle\_subscription ```python -def __init__( - stream_name: str, - aws_region: Optional[str] = getenv("AWS_REGION"), - aws_access_key_id: Optional[str] = getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key: Optional[str] = getenv("AWS_SECRET_ACCESS_KEY"), - aws_endpoint_url: Optional[str] = getenv("AWS_ENDPOINT_URL_KINESIS"), - shutdown_timeout: float = 10, - auto_offset_reset: AutoOffsetResetType = "latest", - max_records_per_shard: int = 1000, - commit_interval: float = 5.0, - retry_backoff_secs: float = 5.0) +def handle_subscription() -> Subscription ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/kinesis.py#L57) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/consumer.py#L144) -**Arguments**: +Handles subscription management in one place. -- `stream_name`: name of the desired Kinesis stream to consume. -- `aws_region`: The AWS region. -NOTE: can alternatively set the AWS_REGION environment variable -- `aws_access_key_id`: the AWS access key ID. -NOTE: can alternatively set the AWS_ACCESS_KEY_ID environment variable -- `aws_secret_access_key`: the AWS secret access key. -NOTE: can alternatively set the AWS_SECRET_ACCESS_KEY environment variable -- `aws_endpoint_url`: the endpoint URL to use; only required for connecting -to a locally hosted Kinesis. -NOTE: can alternatively set the AWS_ENDPOINT_URL_KINESIS environment variable -- `shutdown_timeout`: -- `auto_offset_reset`: When no previous offset has been recorded, whether to -start from the beginning ("earliest") or end ("latest") of the stream. -- `max_records_per_shard`: During round-robin consumption, how many records -to consume per shard (partition) per consume (NOT per-commit). -- `commit_interval`: the time between commits -- `retry_backoff_secs`: how long to back off from doing HTTP calls for a -shard when Kinesis consumer encounters handled/expected errors. +Subscriptions work similarly to Kafka consumer groups. - +- Each topic can have multiple subscriptions (consumer group ~= subscription). -## quixstreams.sources.community.kinesis +- A subscription can have multiple subscribers (similar to consumers in a group). - +- NOTE: exactly-once adds message methods (ack_with_response) when enabled. 
-## quixstreams.sources.community.kinesis.consumer + - +## quixstreams.sources.community.pubsub.pubsub -### KinesisStreamShardsNotFound + + +### PubSubSource ```python -class KinesisStreamShardsNotFound(Exception) +class PubSubSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/consumer.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/pubsub.py#L16) -Raised when the Kinesis Stream has no shards +This source enables reading from a Google Cloud Pub/Sub topic, +dumping it to a kafka topic using desired SDF-based transformations. - +Provides "at-least-once" guarantees. -### KinesisConsumer +Currently, forwarding message keys ("ordered messages" in Pub/Sub) is unsupported. + +The incoming message value will be in bytes, so transform in your SDF accordingly. + +Example Usage: ```python -class KinesisConsumer() -``` +from quixstreams import Application +from quixstreams.sources.community.pubsub import PubSubSource +from os import environ -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/consumer.py#L63) +source = PubSubSource( + # Suggested: pass JSON-formatted credentials from an environment variable. + service_account_json = environ["PUBSUB_SERVICE_ACCOUNT_JSON"], + project_id="", + topic_id="", # NOTE: NOT the full /x/y/z path! + subscription_id="", # NOTE: NOT the full /x/y/z path! + create_subscription=True, +) +app = Application( + broker_address="localhost:9092", + auto_offset_reset="earliest", + consumer_group="gcp", + loglevel="INFO" +) +sdf = app.dataframe(source=source).print(metadata=True) -Consume all shards for a given Kinesis stream in a batched, round-robin fashion. -Also handles checkpointing of said stream (requires a `KinesisCheckpointer`). +if __name__ == "__main__": + app.run() +``` - + -#### KinesisConsumer.process\_shards +#### PubSubSource.\_\_init\_\_ ```python -def process_shards() +def __init__(project_id: str, + topic_id: str, + subscription_id: str, + service_account_json: Optional[str] = None, + commit_every: int = 100, + commit_interval: float = 5.0, + create_subscription: bool = False, + enable_message_ordering: bool = False, + shutdown_timeout: float = 10.0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/consumer.py#L90) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/pubsub.py#L55) -Process records from the Stream shards one by one and checkpoint their -sequence numbers. +**Arguments**: - +- `project_id`: a Google Cloud project ID. +- `topic_id`: a Pub/Sub topic ID (NOT the full path). +- `subscription_id`: a Pub/Sub subscription ID (NOT the full path). +- `service_account_json`: a Google Cloud Credentials JSON as a string +Can instead use environment variables (which have different behavior): +- "GOOGLE_APPLICATION_CREDENTIALS" set to a JSON filepath i.e. /x/y/z.json +- "PUBSUB_EMULATOR_HOST" set to a URL if using an emulated Pub/Sub +- `commit_every`: max records allowed to be processed before committing. +- `commit_interval`: max allowed elapsed time between commits. +- `create_subscription`: whether to attempt to create a subscription at +startup; if it already exists, it instead logs its details (DEBUG level). +- `enable_message_ordering`: When creating a Pub/Sub subscription, whether +to allow message ordering. 
NOTE: does NOT affect existing subscriptions! +- `shutdown_timeout`: How long to wait for a graceful shutdown of the source. -#### KinesisConsumer.commit + -```python -def commit(force: bool = False) -``` +## quixstreams.sources.community.file.compressions -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/consumer.py#L104) + -Commit the checkpoint and save the progress of the +## quixstreams.sources.community.file.compressions.base - + -## quixstreams.sources.community.file.formats.parquet +## quixstreams.sources.community.file.compressions.gzip @@ -10994,7 +11145,7 @@ Commit the checkpoint and save the progress of the class JSONFormat(Format) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/json.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/json.py#L12) @@ -11005,7 +11156,7 @@ def __init__(compression: Optional[CompressionName], loads: Optional[Callable[[str], dict]] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/json.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/json.py#L13) Read a JSON-formatted file (along with decompressing it). @@ -11015,6 +11166,10 @@ Read a JSON-formatted file (along with decompressing it). - `loads`: A custom function to deserialize objects to the expected dict with {_key: str, _value: dict, _timestamp: int}. + + +## quixstreams.sources.community.file.formats.parquet + ## quixstreams.sources.community.file.formats.base @@ -11027,7 +11182,7 @@ with {_key: str, _value: dict, _timestamp: int}. class Format(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/base.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/base.py#L13) Base class for reading files serialized by the Quix Streams File Sink Connector. @@ -11045,7 +11200,7 @@ Also handles different compression types. def __init__(compression: Optional[CompressionName] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/base.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/base.py#L24) super().__init__() this for a usable init. @@ -11058,7 +11213,7 @@ super().__init__() this for a usable init. 
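+
+A full `Format` subclass is often unnecessary: the `JSONFormat` documented above already
+accepts a custom `loads` callable to reshape each record into the expected
+`{_key: str, _value: dict, _timestamp: int}` structure. A minimal sketch (the import path
+follows the module's file path shown above; the `id`, `data`, and `ts` field names are
+purely illustrative):
+
+```python
+import json
+
+from quixstreams.sources.community.file.formats.json import JSONFormat
+
+
+def custom_loads(raw: str) -> dict:
+    obj = json.loads(raw)
+    # Reshape the record into the structure formats are expected to yield:
+    # {"_key": str, "_value": dict, "_timestamp": int}
+    return {"_key": obj["id"], "_value": obj["data"], "_timestamp": obj["ts"]}
+
+
+# Compression names follow the compressions module above (e.g. "gzip", or None).
+file_format = JSONFormat(compression="gzip", loads=custom_loads)
+```
+
+The resulting instance can then be passed as the `format` argument of `FileSource`, in
+which case `FileSource`'s own `compression` argument is ignored.
+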
def deserialize(filestream: BinaryIO) -> Iterable[dict] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/base.py#L33) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/base.py#L33) Parse a filelike byte stream into a collection of records @@ -11074,140 +11229,28 @@ The iterable should output dicts with the following data/naming structure: - `filestream`: a filelike byte stream (such as `f` from `f = open(file)`) - - -## quixstreams.sources.community.file - - - -## quixstreams.sources.community.file.file - - - -### FileSource - -```python -class FileSource(Source) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/file.py#L19) - -Ingest a set of files from a desired origin into Kafka by iterating through the -provided folder and processing all nested files within it. - -Origins include a local filestore, AWS S3, or Microsoft Azure. - -FileSource defaults to a local filestore (LocalOrigin) + JSON format. - -Expects folder and file structures as generated by the related FileSink connector: - -``` -my_topics/ -├── topic_a/ -│ ├── 0/ -│ │ ├── 0000.ext -│ │ └── 0011.ext -│ └── 1/ -│ ├── 0003.ext -│ └── 0016.ext -└── topic_b/ - └── etc... -``` - -Intended to be used with a single topic (ex: topic_a), but will recursively read -from whatever entrypoint is passed to it. - -File format structure depends on the file format. - -See the `.formats` and `.compressions` modules to see what is supported. - -Example Usage: - -```python -from quixstreams import Application -from quixstreams.sources.community.file import FileSource -from quixstreams.sources.community.file.origins import S3Origin - -app = Application(broker_address="localhost:9092", auto_offset_reset="earliest") - -origin = S3Origin( - bucket="", - aws_access_key_id="", - aws_secret_access_key="", - aws_region="", -) -source = FileSource( - directory="path/to/your/topic_folder/", - origin=origin, - format="json", - compression="gzip", -) -sdf = app.dataframe(source=source).print(metadata=True) -# YOUR LOGIC HERE! - -if __name__ == "__main__": - app.run() -``` - - - -#### FileSource.\_\_init\_\_ - -```python -def __init__(directory: Union[str, Path], - format: Union[Format, FormatName] = "json", - origin: Origin = LocalOrigin(), - compression: Optional[CompressionName] = None, - replay_speed: float = 1.0, - name: Optional[str] = None, - shutdown_timeout: float = 10) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/file.py#L79) - -**Arguments**: + -- `directory`: a directory to recursively read through; it is recommended to -provide the path to a given topic folder (ex: `/path/to/topic_a`). -- `format`: what format the message files are in (ex: json, parquet). -Optionally, can provide a `Format` instance if more than compression -is necessary to define (compression will then be ignored). -- `origin`: an Origin type (defaults to reading local files). -- `compression`: what compression is used on the given files, if any. -- `replay_speed`: Produce the messages with this speed multiplier, which -roughly reflects the time "delay" between the original message producing. -Use any float >= 0, where 0 is no delay, and 1 is the original speed. -NOTE: Time delay will only be accurate per partition, NOT overall. -- `name`: The name of the Source application (Default: last folder name). 
-- `shutdown_timeout`: Time in seconds the application waits for the source -to gracefully shutdown +## quixstreams.sources.community.file.origins.base - + -#### FileSource.default\_topic +### Origin ```python -def default_topic() -> Topic -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/file.py#L152) - -Uses the file structure to generate the desired partition count for the +class Origin(ABC) +``` -internal topic. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/origins/base.py#L8) -**Returns**: +An interface for interacting with a file-based client. -the original default topic, with updated partition count +Provides methods for navigating folders and retrieving/opening raw files. ## quixstreams.sources.community.file.origins.local - - -## quixstreams.sources.community.file.origins - ## quixstreams.sources.community.file.origins.s3 @@ -11220,7 +11263,7 @@ the original default topic, with updated partition count class S3Origin(Origin) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/origins/s3.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/origins/s3.py#L23) @@ -11235,7 +11278,7 @@ def __init__( endpoint_url: Optional[str] = getenv("AWS_ENDPOINT_URL_S3")) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/origins/s3.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/origins/s3.py#L24) Configure IcebergSink to work with AWS Glue. @@ -11252,902 +11295,929 @@ NOTE: can alternatively set the AWS_SECRET_ACCESS_KEY environment variable to a locally hosted S3. NOTE: can alternatively set the AWS_ENDPOINT_URL_S3 environment variable - - -## quixstreams.sources.community.file.origins.base - - - -### Origin - -```python -class Origin(ABC) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/origins/base.py#L8) - -An interface for interacting with a file-based client. - -Provides methods for navigating folders and retrieving/opening raw files. - - - -## quixstreams.sources.community.file.compressions.gzip - - - -## quixstreams.sources.community.file.compressions - - - -## quixstreams.sources.community.file.compressions.base - - - -## quixstreams.sources.community - -This module contains Sources developed and maintained by the members of Quix Streams community. - - - -## quixstreams.sources.community.pubsub - - - -## quixstreams.sources.community.pubsub.consumer - - - -### PubSubSubscriptionNotFound - -```python -class PubSubSubscriptionNotFound(Exception) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/consumer.py#L30) - -Raised when an expected subscription does not exist - - - -### PubSubConsumer - -```python -class PubSubConsumer() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/consumer.py#L34) - - - -#### PubSubConsumer.poll\_and\_process - -```python -def poll_and_process(timeout: Optional[float] = None) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/consumer.py#L105) - -This uses the asynchronous puller to retrieve and handle a message with its -assigned callback. 
- -Committing is a separate step. + - +## quixstreams.sources.community.file.origins -#### PubSubConsumer.poll\_and\_process\_batch + -```python -def poll_and_process_batch() -``` +## quixstreams.sources.community.file -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/consumer.py#L122) + -Polls and processes until either the max_batch_size or batch_timeout is reached. +## quixstreams.sources.community.file.file - + -#### PubSubConsumer.subscribe +### FileSource ```python -def subscribe() +class FileSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/consumer.py#L134) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/file.py#L19) -Asynchronous subscribers require subscribing (synchronous do not). +Ingest a set of files from a desired origin into Kafka by iterating through the +provided folder and processing all nested files within it. -NOTE: This will not detect whether the subscription exists. +Origins include a local filestore, AWS S3, or Microsoft Azure. - +FileSource defaults to a local filestore (LocalOrigin) + JSON format. -#### PubSubConsumer.handle\_subscription +Expects folder and file structures as generated by the related FileSink connector: -```python -def handle_subscription() -> Subscription ``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/consumer.py#L144) - -Handles subscription management in one place. - -Subscriptions work similarly to Kafka consumer groups. - -- Each topic can have multiple subscriptions (consumer group ~= subscription). - -- A subscription can have multiple subscribers (similar to consumers in a group). - -- NOTE: exactly-once adds message methods (ack_with_response) when enabled. - - - -## quixstreams.sources.community.pubsub.pubsub - - - -### PubSubSource - -```python -class PubSubSource(Source) +my_topics/ +├── topic_a/ +│ ├── 0/ +│ │ ├── 0000.ext +│ │ └── 0011.ext +│ └── 1/ +│ ├── 0003.ext +│ └── 0016.ext +└── topic_b/ + └── etc... ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/pubsub.py#L16) - -This source enables reading from a Google Cloud Pub/Sub topic, -dumping it to a kafka topic using desired SDF-based transformations. - -Provides "at-least-once" guarantees. +Intended to be used with a single topic (ex: topic_a), but will recursively read +from whatever entrypoint is passed to it. -Currently, forwarding message keys ("ordered messages" in Pub/Sub) is unsupported. +File format structure depends on the file format. -The incoming message value will be in bytes, so transform in your SDF accordingly. +See the `.formats` and `.compressions` modules to see what is supported. Example Usage: ```python from quixstreams import Application -from quixstreams.sources.community.pubsub import PubSubSource -from os import environ +from quixstreams.sources.community.file import FileSource +from quixstreams.sources.community.file.origins import S3Origin -source = PubSubSource( - # Suggested: pass JSON-formatted credentials from an environment variable. - service_account_json = environ["PUBSUB_SERVICE_ACCOUNT_JSON"], - project_id="", - topic_id="", # NOTE: NOT the full /x/y/z path! - subscription_id="", # NOTE: NOT the full /x/y/z path! 
- create_subscription=True, +app = Application(broker_address="localhost:9092", auto_offset_reset="earliest") + +origin = S3Origin( + bucket="", + aws_access_key_id="", + aws_secret_access_key="", + aws_region="", ) -app = Application( - broker_address="localhost:9092", - auto_offset_reset="earliest", - consumer_group="gcp", - loglevel="INFO" +source = FileSource( + directory="path/to/your/topic_folder/", + origin=origin, + format="json", + compression="gzip", ) sdf = app.dataframe(source=source).print(metadata=True) +# YOUR LOGIC HERE! if __name__ == "__main__": app.run() ``` - - -#### PubSubSource.\_\_init\_\_ - -```python -def __init__(project_id: str, - topic_id: str, - subscription_id: str, - service_account_json: Optional[str] = None, - commit_every: int = 100, - commit_interval: float = 5.0, - create_subscription: bool = False, - enable_message_ordering: bool = False, - shutdown_timeout: float = 10.0) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/pubsub.py#L55) - -**Arguments**: - -- `project_id`: a Google Cloud project ID. -- `topic_id`: a Pub/Sub topic ID (NOT the full path). -- `subscription_id`: a Pub/Sub subscription ID (NOT the full path). -- `service_account_json`: a Google Cloud Credentials JSON as a string -Can instead use environment variables (which have different behavior): -- "GOOGLE_APPLICATION_CREDENTIALS" set to a JSON filepath i.e. /x/y/z.json -- "PUBSUB_EMULATOR_HOST" set to a URL if using an emulated Pub/Sub -- `commit_every`: max records allowed to be processed before committing. -- `commit_interval`: max allowed elapsed time between commits. -- `create_subscription`: whether to attempt to create a subscription at -startup; if it already exists, it instead logs its details (DEBUG level). -- `enable_message_ordering`: When creating a Pub/Sub subscription, whether -to allow message ordering. NOTE: does NOT affect existing subscriptions! -- `shutdown_timeout`: How long to wait for a graceful shutdown of the source. - - - -## quixstreams.sources.base - - - -## quixstreams.sources.base.exceptions - - - -### SourceException - -```python -class SourceException(Exception) -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/exceptions.py#L9) - -Raised in the parent process when a source finish with an exception - - - -## quixstreams.sources.base.source - - + -### BaseSource +#### FileSource.\_\_init\_\_ ```python -class BaseSource(ABC) +def __init__(directory: Union[str, Path], + format: Union[Format, FormatName] = "json", + origin: Origin = LocalOrigin(), + compression: Optional[CompressionName] = None, + replay_speed: float = 1.0, + name: Optional[str] = None, + shutdown_timeout: float = 10) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L17) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/file.py#L79) -This is the base class for all sources. - -Sources are executed in a sub-process of the main application. +**Arguments**: -To create your own source you need to implement: +- `directory`: a directory to recursively read through; it is recommended to +provide the path to a given topic folder (ex: `/path/to/topic_a`). +- `format`: what format the message files are in (ex: json, parquet). +Optionally, can provide a `Format` instance if more than compression +is necessary to define (compression will then be ignored). 
+- `origin`: an Origin type (defaults to reading local files). +- `compression`: what compression is used on the given files, if any. +- `replay_speed`: Produce the messages with this speed multiplier, which +roughly reflects the time "delay" between the original message producing. +Use any float >= 0, where 0 is no delay, and 1 is the original speed. +NOTE: Time delay will only be accurate per partition, NOT overall. +- `name`: The name of the Source application (Default: last folder name). +- `shutdown_timeout`: Time in seconds the application waits for the source +to gracefully shutdown -* `start` -* `stop` -* `default_topic` + -`BaseSource` is the most basic interface, and the framework expects every -source to implement it. -Use `Source` to benefit from a base implementation. +#### FileSource.default\_topic -You can connect a source to a StreamingDataframe using the Application. +```python +def default_topic() -> Topic +``` -Example snippet: +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/file.py#L152) -```python -class RandomNumbersSource(BaseSource): -def __init__(self): - super().__init__() - self._running = False +Uses the file structure to generate the desired partition count for the -def start(self): - self._running = True +internal topic. - while self._running: - number = random.randint(0, 100) - serialized = self._producer_topic.serialize(value=number) - self._producer.produce( - topic=self._producer_topic.name, - key=serialized.key, - value=serialized.value, - ) +**Returns**: -def stop(self): - self._running = False +the original default topic, with updated partition count -def default_topic(self) -> Topic: - return Topic( - name="topic-name", - value_deserializer="json", - value_serializer="json", - ) + +## quixstreams.sources.community.kinesis -def main(): - app = Application(broker_address="localhost:9092") - source = RandomNumbersSource() + - sdf = app.dataframe(source=source) - sdf.print(metadata=True) +## quixstreams.sources.community.kinesis.consumer - app.run() + +### KinesisStreamShardsNotFound -if __name__ == "__main__": - main() +```python +class KinesisStreamShardsNotFound(Exception) ``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/consumer.py#L28) -#### BaseSource.configure +Raised when the Kinesis Stream has no shards + + + +### KinesisConsumer ```python -def configure(topic: Topic, producer: RowProducer, **kwargs) -> None +class KinesisConsumer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L88) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/consumer.py#L63) -This method is triggered before the source is started. - -It configures the source's Kafka producer, the topic it will produce to and optional dependencies. +Consume all shards for a given Kinesis stream in a batched, round-robin fashion. +Also handles checkpointing of said stream (requires a `KinesisCheckpointer`). 
- + -#### BaseSource.start +#### KinesisConsumer.process\_shards ```python -@abstractmethod -def start() -> None +def process_shards() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L110) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/consumer.py#L90) -This method is triggered in the subprocess when the source is started. - -The subprocess will run as long as the start method executes. -Use it to fetch data and produce it to Kafka. +Process records from the Stream shards one by one and checkpoint their +sequence numbers. - + -#### BaseSource.stop +#### KinesisConsumer.commit ```python -@abstractmethod -def stop() -> None +def commit(force: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L119) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/consumer.py#L104) -This method is triggered when the application is shutting down. +Commit the checkpoint and save the progress of the -The source must ensure that the `run` method is completed soon. + - +## quixstreams.sources.community.kinesis.kinesis -#### BaseSource.default\_topic + + +### KinesisSource ```python -@abstractmethod -def default_topic() -> Topic +class KinesisSource(StatefulSource) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L127) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/kinesis.py#L18) -This method is triggered when the topic is not provided to the source. +NOTE: Requires `pip install quixstreams[kinesis]` to work. -The source must return a default topic configuration. +This source reads data from an Amazon Kinesis stream, dumping it to a +kafka topic using desired `StreamingDataFrame`-based transformations. -Note: if the default topic is used, the Application will prefix its name with "source__". +Provides "at-least-once" guarantees. - +The incoming message value will be in bytes, so transform in your SDF accordingly. -### Source +Example Usage: ```python -class Source(BaseSource) -``` +from quixstreams import Application +from quixstreams.sources.community.kinesis import KinesisSource -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L137) -A base class for custom Sources that provides a basic implementation of `BaseSource` -interface. -It is recommended to interface to create custom sources. +kinesis = KinesisSource( + stream_name="", + aws_access_key_id="", + aws_secret_access_key="", + aws_region="", + auto_offset_reset="earliest", # start from the beginning of the stream (vs end) +) -Subclass it and implement the `run` method to fetch data and produce it to Kafka. +app = Application( + broker_address="", + consumer_group="", +) -**Example**: +sdf = app.dataframe(source=kinesis).print(metadata=True) +# YOUR LOGIC HERE! 
- -```python -import random -import time +if __name__ == "__main__": + app.run() +``` -from quixstreams import Application -from quixstreams.sources import Source + +#### KinesisSource.\_\_init\_\_ -class RandomNumbersSource(Source): - def run(self): - while self.running: - number = random.randint(0, 100) - serialized = self._producer_topic.serialize(value=number) - self.produce(key=str(number), value=serialized.value) - time.sleep(0.5) +```python +def __init__( + stream_name: str, + aws_region: Optional[str] = getenv("AWS_REGION"), + aws_access_key_id: Optional[str] = getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key: Optional[str] = getenv("AWS_SECRET_ACCESS_KEY"), + aws_endpoint_url: Optional[str] = getenv("AWS_ENDPOINT_URL_KINESIS"), + shutdown_timeout: float = 10, + auto_offset_reset: AutoOffsetResetType = "latest", + max_records_per_shard: int = 1000, + commit_interval: float = 5.0, + retry_backoff_secs: float = 5.0) +``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/kinesis.py#L57) -def main(): - app = Application(broker_address="localhost:9092") - source = RandomNumbersSource(name="random-source") +**Arguments**: - sdf = app.dataframe(source=source) - sdf.print(metadata=True) +- `stream_name`: name of the desired Kinesis stream to consume. +- `aws_region`: The AWS region. +NOTE: can alternatively set the AWS_REGION environment variable +- `aws_access_key_id`: the AWS access key ID. +NOTE: can alternatively set the AWS_ACCESS_KEY_ID environment variable +- `aws_secret_access_key`: the AWS secret access key. +NOTE: can alternatively set the AWS_SECRET_ACCESS_KEY environment variable +- `aws_endpoint_url`: the endpoint URL to use; only required for connecting +to a locally hosted Kinesis. +NOTE: can alternatively set the AWS_ENDPOINT_URL_KINESIS environment variable +- `shutdown_timeout`: +- `auto_offset_reset`: When no previous offset has been recorded, whether to +start from the beginning ("earliest") or end ("latest") of the stream. +- `max_records_per_shard`: During round-robin consumption, how many records +to consume per shard (partition) per consume (NOT per-commit). +- `commit_interval`: the time between commits +- `retry_backoff_secs`: how long to back off from doing HTTP calls for a +shard when Kinesis consumer encounters handled/expected errors. - app.run() + +## quixstreams.sources -if __name__ == "__main__": - main() -``` - - - Helper methods and properties: - - * `serialize()` - * `produce()` - * `flush()` - * `running` + - +## quixstreams.logging -#### Source.\_\_init\_\_ + + +#### configure\_logging ```python -def __init__(name: str, shutdown_timeout: float = 10) -> None +def configure_logging(loglevel: Optional[Union[int, LogLevel]], + name: str = LOGGER_NAME, + pid: bool = False) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L187) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/logging.py#L24) + +Configure "quixstreams" logger. + +>***NOTE:*** If "quixstreams" logger already has pre-defined handlers +(e.g. logging has already been configured via `logging`, or the function +is called twice), it will skip configuration and return `False`. **Arguments**: -- `name`: The source unique name. It is used to generate the topic configuration. -- `shutdown_timeout`: Time in second the application waits for the source to gracefully shutdown. 
+- `loglevel`: a valid log level as a string or None. +If None passed, this function is no-op and no logging will be configured. +- `name`: the log name included in the output +- `pid`: if True include the process PID in the logs - +**Returns**: -#### Source.running +True if logging config has been updated, otherwise False. -```python -@property -def running() -> bool -``` + -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L201) +## quixstreams.error\_callbacks -Property indicating if the source is running. + -The `stop` method will set it to `False`. Use it to stop the source gracefully. +## quixstreams.app - + -#### Source.cleanup +### Application ```python -def cleanup(failed: bool) -> None +class Application() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L209) - -This method is triggered once the `run` method completes. - -Use it to clean up the resources and shut down the source gracefully. - -It flushes the producer when `_run` completes successfully. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L75) - +The main Application class. -#### Source.stop +Typically, the primary object needed to get a kafka application up and running. -```python -def stop() -> None -``` +Most functionality is explained the various methods, except for +"column assignment". -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L220) -This method is triggered when the application is shutting down. +What it Does: -It sets the `running` property to `False`. +- On init: + - Provides defaults or helper methods for commonly needed objects + - If `quix_sdk_token` is passed, configures the app to use the Quix Cloud. +- When executed via `.run()` (after setup): + - Initializes Topics and StreamingDataFrames + - Facilitates processing of Kafka messages with a `StreamingDataFrame` + - Handles all Kafka client consumer/producer responsibilities. - -#### Source.start +Example Snippet: ```python -def start() -> None -``` +from quixstreams import Application -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L228) +# Set up an `app = Application` and `sdf = StreamingDataFrame`; +# add some operations to `sdf` and then run everything. -This method is triggered in the subprocess when the source is started. +app = Application(broker_address='localhost:9092', consumer_group='group') +topic = app.topic('test-topic') +df = app.dataframe(topic) +df.apply(lambda value, context: print('New message', value)) -It marks the source as running, execute it's run method and ensure cleanup happens. 
+app.run() +``` - + -#### Source.run +#### Application.\_\_init\_\_ ```python -@abstractmethod -def run() +def __init__(broker_address: Optional[Union[str, ConnectionConfig]] = None, + *, + quix_sdk_token: Optional[str] = None, + consumer_group: Optional[str] = None, + auto_offset_reset: AutoOffsetReset = "latest", + commit_interval: float = 5.0, + commit_every: int = 0, + consumer_extra_config: Optional[dict] = None, + producer_extra_config: Optional[dict] = None, + state_dir: Union[str, Path] = Path("state"), + rocksdb_options: Optional[RocksDBOptionsType] = None, + on_consumer_error: Optional[ConsumerErrorCallback] = None, + on_processing_error: Optional[ProcessingErrorCallback] = None, + on_producer_error: Optional[ProducerErrorCallback] = None, + on_message_processed: Optional[MessageProcessedCallback] = None, + consumer_poll_timeout: float = 1.0, + producer_poll_timeout: float = 0.0, + loglevel: Optional[Union[int, LogLevel]] = "INFO", + auto_create_topics: bool = True, + use_changelog_topics: bool = True, + quix_config_builder: Optional[QuixKafkaConfigsBuilder] = None, + topic_manager: Optional[TopicManager] = None, + request_timeout: float = 30, + topic_create_timeout: float = 60, + processing_guarantee: ProcessingGuarantee = "at-least-once") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L113) -This method is triggered in the subprocess when the source is started. +**Arguments**: -The subprocess will run as long as the run method executes. -Use it to fetch data and produce it to Kafka. +- `broker_address`: Connection settings for Kafka. +Used by Producer, Consumer, and Admin clients. +Accepts string with Kafka broker host and port formatted as `:`, +or a ConnectionConfig object if authentication is required. +Either this OR `quix_sdk_token` must be set to use `Application` (not both). +Takes priority over quix auto-configuration. +Linked Environment Variable: `Quix__Broker__Address`. +Default: `None` +- `quix_sdk_token`: If using the Quix Cloud, the SDK token to connect with. +Either this OR `broker_address` must be set to use Application (not both). +Linked Environment Variable: `Quix__Sdk__Token`. +Default: None (if not run on Quix Cloud) + >***NOTE:*** the environment variable is set for you in the Quix Cloud +- `consumer_group`: Kafka consumer group. +Passed as `group.id` to `confluent_kafka.Consumer`. +Linked Environment Variable: `Quix__Consumer__Group`. +Default - "quixstreams-default" (set during init) + >***NOTE:*** Quix Applications will prefix it with the Quix workspace id. +- `commit_interval`: How often to commit the processed messages in seconds. +Default - 5.0. +- `commit_every`: Commit the checkpoint after processing N messages. +Use this parameter for more granular control of the commit schedule. +If the value is > 0, the application will commit the checkpoint after +processing the specified number of messages across all the assigned +partitions. +If the value is <= 0, only the `commit_interval` will be considered. +Default - 0. + >***NOTE:*** Only input offsets are counted, and the application + > may produce more results than the number of incoming messages. +- `auto_offset_reset`: Consumer `auto.offset.reset` setting +- `consumer_extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Consumer` as is. 
+- `producer_extra_config`: A dictionary with additional options that
+will be passed to `confluent_kafka.Producer` as is.
+- `state_dir`: path to the application state directory.
+Default - `"state"`.
+- `rocksdb_options`: RocksDB options.
+If `None`, the default options will be used.
+- `consumer_poll_timeout`: timeout for `RowConsumer.poll()`. Default - `1.0`s
+- `producer_poll_timeout`: timeout for `RowProducer.poll()`. Default - `0`s.
+- `on_message_processed`: a callback triggered when a message is successfully
+processed.
+- `loglevel`: a log level for the "quixstreams" logger.
+Should be a string or None.
+If `None` is passed, no logging will be configured.
+You may pass `None` and configure the "quixstreams" logger
+externally using the `logging` library.
+Default - `"INFO"`.
+- `auto_create_topics`: Create all `Topic`s made via Application.topic()
+Default - `True`
+- `use_changelog_topics`: Use changelog topics to back stateful operations
+Default - `True`
+- `topic_manager`: A `TopicManager` instance
+- `request_timeout`: timeout (seconds) for REST-based requests
+- `topic_create_timeout`: timeout (seconds) for topic create finalization
+- `processing_guarantee`: Use "exactly-once" or "at-least-once" processing.
+
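As a minimal sketch, the constructor parameters above can be combined like this (the broker address, group name, and values are placeholders):

```python
from quixstreams import Application

# Hypothetical configuration combining several of the documented parameters.
app = Application(
    broker_address="localhost:9092",      # or a ConnectionConfig when authentication is required
    consumer_group="my-consumer-group",
    auto_offset_reset="earliest",
    commit_interval=2.0,                  # commit the checkpoint every 2 seconds...
    commit_every=1000,                    # ...or after 1000 processed messages, whichever comes first
    processing_guarantee="at-least-once",
)
```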

***Error Handlers***
+To handle errors, `Application` accepts callbacks triggered when
+ exceptions occur at different stages of stream processing. If the callback
+ returns `True`, the exception will be ignored. Otherwise, the exception
+ will be propagated and the processing will eventually stop.
+- `on_consumer_error`: triggered when the internal `RowConsumer` fails
+to poll Kafka or cannot deserialize a message.
+- `on_processing_error`: triggered when an exception is raised within
+`StreamingDataFrame.process()`.
+- `on_producer_error`: triggered when `RowProducer` fails to serialize
+or to produce a message to Kafka.
+
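As a minimal sketch of such a callback (the exact callback signatures are not documented on this page, so the arguments are accepted generically and only the first one is assumed to be the raised exception):

```python
from quixstreams import Application

def ignore_processing_errors(exc, *args, **kwargs) -> bool:
    # Assumption: the first argument is the raised exception; the remaining
    # arguments are not documented here, so they are accepted generically.
    print(f"Ignoring processing error: {exc}")
    return True  # returning True means the exception is ignored and processing continues

app = Application(
    broker_address="localhost:9092",  # placeholder
    on_processing_error=ignore_processing_errors,
)
```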

***Quix Cloud Parameters***
+- `quix_config_builder`: instance of `QuixKafkaConfigsBuilder` to be used +instead of the default one. +> NOTE: It is recommended to just use `quix_sdk_token` instead. - + -#### Source.serialize +#### Application.Quix ```python -def serialize(key: Optional[object] = None, - value: Optional[object] = None, - headers: Optional[Headers] = None, - timestamp_ms: Optional[int] = None) -> KafkaMessage +@classmethod +def Quix(cls, *args, **kwargs) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L252) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L352) -Serialize data to bytes using the producer topic serializers and return a `quixstreams.models.messages.KafkaMessage`. - -**Returns**: +RAISES EXCEPTION: DEPRECATED. -`quixstreams.models.messages.KafkaMessage` +use Application() with "quix_sdk_token" parameter or set the "Quix__Sdk__Token" +environment variable. - + -#### Source.produce +#### Application.topic ```python -def produce(value: Optional[Union[str, bytes]] = None, - key: Optional[Union[str, bytes]] = None, - headers: Optional[Headers] = None, - partition: Optional[int] = None, - timestamp: Optional[int] = None, - poll_timeout: float = 5.0, - buffer_error_max_tries: int = 3) -> None +def topic(name: str, + value_deserializer: DeserializerType = "json", + key_deserializer: DeserializerType = "bytes", + value_serializer: SerializerType = "json", + key_serializer: SerializerType = "bytes", + config: Optional[TopicConfig] = None, + timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L268) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L384) -Produce a message to the configured source topic in Kafka. +Create a topic definition. - +Allows you to specify serialization that should be used when consuming/producing +to the topic in the form of a string name (i.e. "json" for JSON) or a +serialization class instance directly, like JSONSerializer(). -#### Source.flush -```python -def flush(timeout: Optional[float] = None) -> None -``` +Example Snippet: -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L293) +```python +from quixstreams import Application -This method flush the producer. +# Specify an input and output topic for a `StreamingDataFrame` instance, +# where the output topic requires adjusting the key serializer. -It ensures all messages are successfully delivered to Kafka. +app = Application() +input_topic = app.topic("input-topic", value_deserializer="json") +output_topic = app.topic( + "output-topic", key_serializer="str", value_serializer=JSONSerializer() +) +sdf = app.dataframe(input_topic) +sdf.to_topic(output_topic) +``` **Arguments**: -- `timeout` (`float`): time to attempt flushing (seconds). -None use producer default or -1 is infinite. 
Default: None - -**Raises**: - -- `CheckpointProducerTimeout`: if any message fails to produce before the timeout - - +- `name`: topic name +>***NOTE:*** If the application is created via `Quix.Application()`, +the topic name will be prefixed by Quix workspace id, and it will +be `-` +- `value_deserializer`: a deserializer type for values; default="json" +- `key_deserializer`: a deserializer type for keys; default="bytes" +- `value_serializer`: a serializer type for values; default="json" +- `key_serializer`: a serializer type for keys; default="bytes" +- `config`: optional topic configurations (for creation/validation) +>***NOTE:*** will not create without Application's auto_create_topics set +to True (is True by default) +- `timestamp_extractor`: a callable that returns a timestamp in +milliseconds from a deserialized message. Default - `None`. -#### Source.default\_topic +Example Snippet: ```python -def default_topic() -> Topic -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L311) +app = Application(...) -Return a default topic matching the source name. -The default topic will not be used if the topic has already been provided to the source. +def custom_ts_extractor( + value: Any, + headers: Optional[List[Tuple[str, bytes]]], + timestamp: float, + timestamp_type: TimestampType, +) -> int: + return value["timestamp"] -Note: if the default topic is used, the Application will prefix its name with "source__". +topic = app.topic("input-topic", timestamp_extractor=custom_ts_extractor) +``` **Returns**: -`quixstreams.models.topics.Topic` +`Topic` object - + -### StatefulSource +#### Application.dataframe ```python -class StatefulSource(Source) +def dataframe(topic: Optional[Topic] = None, + source: Optional[BaseSource] = None) -> StreamingDataFrame ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L330) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L464) -A `Source` class for custom Sources that need a state. +A simple helper method that generates a `StreamingDataFrame`, which is used -Subclasses are responsible for flushing, by calling `flush`, at reasonable intervals. +to define your message processing pipeline. -**Example**: +The topic is what the `StreamingDataFrame` will use as its input, unless +a source is provided (`topic` is optional when using a `source`). - -```python -import random -import time +If both `topic` AND `source` are provided, the source will write to that topic +instead of its default topic (which the `StreamingDataFrame` then consumes). -from quixstreams import Application -from quixstreams.sources import StatefulSource +See :class:`quixstreams.dataframe.StreamingDataFrame` for more details. +Example Snippet: -class RandomNumbersSource(StatefulSource): - def run(self): +```python +from quixstreams import Application - i = 0 - while self.running: - previous = self.state.get("number", 0) - current = random.randint(0, 100) - self.state.set("number", current) +# Set up an `app = Application` and `sdf = StreamingDataFrame`; +# add some operations to `sdf` and then run everything. 
- serialized = self._producer_topic.serialize(value=current + previous) - self.produce(key=str(current), value=serialized.value) - time.sleep(0.5) +app = Application(broker_address='localhost:9092', consumer_group='group') +topic = app.topic('test-topic') +df = app.dataframe(topic) +df.apply(lambda value, context: print('New message', value) - # flush the state every 10 messages - i += 1 - if i % 10 == 0: - self.flush() +app.run() +``` +**Arguments**: -def main(): - app = Application(broker_address="localhost:9092") - source = RandomNumbersSource(name="random-source") +- `topic`: a `quixstreams.models.Topic` instance +to be used as an input topic. +- `source`: a `quixstreams.sources` "BaseSource" instance - sdf = app.dataframe(source=source) - sdf.print(metadata=True) +**Returns**: - app.run() +`StreamingDataFrame` object + + +#### Application.stop -if __name__ == "__main__": - main() +```python +def stop(fail: bool = False) ``` - +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L520) -#### StatefulSource.\_\_init\_\_ +Stop the internal poll loop and the message processing. -```python -def __init__(name: str, shutdown_timeout: float = 10) -> None -``` +Only necessary when manually managing the lifecycle of the `Application` ( +likely through some sort of threading). -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L380) +To otherwise stop an application, either send a `SIGTERM` to the process +(like Kubernetes does) or perform a typical `KeyboardInterrupt` (`Ctrl+C`). **Arguments**: -- `name`: The source unique name. It is used to generate the topic configuration. -- `shutdown_timeout`: Time in second the application waits for the source to gracefully shutdown. +- `fail`: if True, signals that application is stopped due +to unhandled exception, and it shouldn't commit the current checkpoint. - + -#### StatefulSource.configure +#### Application.get\_producer ```python -def configure(topic: Topic, - producer: RowProducer, - *, - store_partition: Optional[StorePartition] = None, - **kwargs) -> None +def get_producer() -> Producer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L390) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L565) -This method is triggered before the source is started. +Create and return a pre-configured Producer instance. +The Producer is initialized with params passed to Application. -It configures the source's Kafka producer, the topic it will produce to and the store partition. +It's useful for producing data to Kafka outside the standard Application processing flow, +(e.g. to produce test data into a topic). +Using this within the StreamingDataFrame functions is not recommended, as it creates a new Producer +instance each time, which is not optimized for repeated use in a streaming pipeline. - +Example Snippet: -#### StatefulSource.store\_partitions\_count +```python +from quixstreams import Application + +app = Application(...) 
+topic = app.topic("input") + +with app.get_producer() as producer: + for i in range(100): + producer.produce(topic=topic.name, key=b"key", value=b"value") +``` + + + +#### Application.get\_consumer ```python -@property -def store_partitions_count() -> int +def get_consumer(auto_commit_enable: bool = True) -> Consumer ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L613) -Count of store partitions. +Create and return a pre-configured Consumer instance. -Used to configure the number of partition in the changelog topic. +The Consumer is initialized with params passed to Application. - +It's useful for consuming data from Kafka outside the standard +Application processing flow. +(e.g., to consume test data from a topic). +Using it within the StreamingDataFrame functions is not recommended, as it +creates a new Consumer instance +each time, which is not optimized for repeated use in a streaming pipeline. -#### StatefulSource.assigned\_store\_partition +Note: By default, this consumer does not autocommit the consumed offsets to allow +at-least-once processing. +To store the offset call store_offsets() after processing a message. +If autocommit is necessary set `enable.auto.offset.store` to True in +the consumer config when creating the app. + +Example Snippet: ```python -@property -def assigned_store_partition() -> int +from quixstreams import Application + +app = Application(...) +topic = app.topic("input") + +with app.get_consumer() as consumer: + consumer.subscribe([topic.name]) + while True: + msg = consumer.poll(timeout=1.0) + if msg is not None: + # Process message + # Optionally commit the offset + # consumer.store_offsets(msg) + ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L418) +**Arguments**: -The store partition assigned to this instance +- `auto_commit_enable`: Enable or disable auto commit +Default - True - + -#### StatefulSource.store\_name +#### Application.clear\_state ```python -@property -def store_name() -> str +def clear_state() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L425) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L663) -The source store name +Clear the state of the application. - + -#### StatefulSource.state +#### Application.add\_source ```python -@property -def state() -> State +def add_source(source: BaseSource, topic: Optional[Topic] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L432) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L669) -Access the `State` of the source. +Add a source to the application. -The `State` lifecycle is tied to the store transaction. A transaction is only valid until the next `.flush()` call. If no valid transaction exist, a new transaction is created. +Use when no transformations (which requires a `StreamingDataFrame`) are needed. -Important: after each `.flush()` call, a previously returned instance is invalidated and cannot be used. The property must be called again. +See :class:`quixstreams.sources.base.BaseSource` for more details. 
- +**Arguments**: -#### StatefulSource.flush +- `source`: a :class:`quixstreams.sources.BaseSource` instance +- `topic`: the :class:`quixstreams.models.Topic` instance the source will produce to +Default - the topic generated by the `source.default_topic()` method. +Note: the names of default topics are prefixed with "source__". + + + +#### Application.run ```python -def flush(timeout: Optional[float] = None) -> None +def run(dataframe: Optional[StreamingDataFrame] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L451) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L700) -This method commit the state and flush the producer. +Start processing data from Kafka using provided `StreamingDataFrame` -It ensures the state is published to the changelog topic and all messages are successfully delivered to Kafka. +Once started, it can be safely terminated with a `SIGTERM` signal +(like Kubernetes does) or a typical `KeyboardInterrupt` (`Ctrl+C`). -**Arguments**: -- `timeout` (`float`): time to attempt flushing (seconds). -None use producer default or -1 is infinite. Default: None +Example Snippet: -**Raises**: +```python +from quixstreams import Application -- `CheckpointProducerTimeout`: if any message fails to produce before the timeout +# Set up an `app = Application` and `sdf = StreamingDataFrame`; +# add some operations to `sdf` and then run everything. - +app = Application(broker_address='localhost:9092', consumer_group='group') +topic = app.topic('test-topic') +df = app.dataframe(topic) +df.apply(lambda value, context: print('New message', value) -## quixstreams.sources.base.manager +app.run() +``` - + -### SourceProcess +#### Application.setup\_topics ```python -class SourceProcess(process) +def setup_topics() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L30) - -An implementation of the Source subprocess. - -It manages a source and its subprocess, handles the communication between the child and parent processes, -lifecycle, and error handling. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L823) -Some methods are designed to be used from the parent process, and others from the child process. +Validate and create the topics - + -#### SourceProcess.run +### ApplicationConfig ```python -def run() -> None +class ApplicationConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L999) -An entrypoint of the child process. +Immutable object holding the application configuration -Responsible for: - * Configuring the signal handlers to handle shutdown properly - * Execution of the source `run` method - * Reporting the source exceptions to the parent process +For details see :class:`quixstreams.Application` - + -#### SourceProcess.raise\_for\_error +#### ApplicationConfig.settings\_customise\_sources ```python -def raise_for_error() -> None +@classmethod +def settings_customise_sources( + cls, settings_cls: Type[PydanticBaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + dotenv_settings: PydanticBaseSettingsSource, + file_secret_settings: PydanticBaseSettingsSource +) -> Tuple[PydanticBaseSettingsSource, ...] 
``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L196) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L1034) -Raise a `quixstreams.sources.manager.SourceException` -if the child process was terminated with an exception. +Included to ignore reading/setting values from the environment - + -#### SourceProcess.stop +#### ApplicationConfig.copy ```python -def stop() +def copy(**kwargs) -> Self ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L220) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/app.py#L1047) -Handle shutdown of the source and its subprocess. +Update the application config and return a copy -First, it tries to shut down gracefully by sending a SIGTERM and waiting up to -`source.shutdown_timeout` seconds for the process to exit. If the process -is still alive, it will kill it with a SIGKILL. + - +## quixstreams.context -### SourceManager + + +#### set\_message\_context ```python -class SourceManager() +def set_message_context(context: Optional[MessageContext]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L243) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/context.py#L22) -Class managing the sources registered with the app +Set a MessageContext for the current message in the given `contextvars.Context` -Sources run in their separate process pay attention about cross-process communication +>***NOTE:*** This is for advanced usage only. If you need to change the message key, +`StreamingDataFrame.to_topic()` has an argument for it. - -#### SourceManager.register +Example Snippet: ```python -def register(source: BaseSource, topic, producer, consumer, - topic_manager) -> SourceProcess -``` +from quixstreams import Application, set_message_context, message_context -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L253) +# Changes the current sdf value based on what the message partition is. +def alter_context(value): + context = message_context() + if value > 1: + context.headers = context.headers + (b"cool_new_header", value.encode()) + set_message_context(context) -Register a new source in the manager. 
+app = Application() +sdf = app.dataframe() +sdf = sdf.update(lambda value: alter_context(value)) +``` -Each source need to already be configured, can't reuse a topic and must be unique +**Arguments**: - +- `context`: instance of `MessageContext` -#### SourceManager.raise\_for\_error + + +#### message\_context ```python -def raise_for_error() -> None +def message_context() -> Optional[MessageContext] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L304) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/context.py#L53) -Raise an exception if any process has stopped with an exception +Get a MessageContext for the current message, which houses most of the message - +metadata, like: + - key + - timestamp + - partition + - offset -#### SourceManager.is\_alive + +Example Snippet: ```python -def is_alive() -> bool -``` +from quixstreams import Application, message_context -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/manager.py#L311) +# Changes the current sdf value based on what the message partition is. -Check if any process is alive +app = Application() +sdf = app.dataframe() +sdf = sdf.apply(lambda value: 1 if message_context().partition == 2 else 0) +``` **Returns**: -True if at least one process is alive - - - -## quixstreams.sources.base.multiprocessing +instance of `MessageContext` @@ -12161,7 +12231,7 @@ True if at least one process is alive class RowConsumer(BaseConsumer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowconsumer.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowconsumer.py#L19) @@ -12178,7 +12248,7 @@ def __init__(broker_address: Union[str, ConnectionConfig], on_error: Optional[ConsumerErrorCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowconsumer.py#L20) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowconsumer.py#L20) A consumer class that is capable of deserializing Kafka messages to Rows @@ -12221,7 +12291,7 @@ def subscribe(topics: List[Topic], on_lost: Optional[RebalancingCallback] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowconsumer.py#L72) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowconsumer.py#L72) Set subscription to supplied list of topics. @@ -12250,7 +12320,7 @@ for example, may fail. def poll_row(timeout: Optional[float] = None) -> Union[Row, List[Row], None] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/rowconsumer.py#L106) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowconsumer.py#L106) Consumes a single message and deserialize it to Row or a list of Rows. @@ -12266,176 +12336,106 @@ If Kafka returns an error, it will be raised as exception. single Row, list of Rows or None - - -## quixstreams.checkpointing.checkpoint - - - -### BaseCheckpoint - -```python -class BaseCheckpoint() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L29) - -Base class to keep track of state updates and consumer offsets and to checkpoint these -updates on schedule. 
- -Two implementations exist: - * one for checkpointing the Application in quixstreams/checkpoint/checkpoint.py - * one for checkpointing the kafka source in quixstreams/sources/kafka/checkpoint.py - - - -#### BaseCheckpoint.expired - -```python -def expired() -> bool -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L58) + -Returns `True` if checkpoint deadline has expired OR -if the total number of processed offsets exceeded the "commit_every" limit -when it's defined. +## quixstreams.rowproducer - + -#### BaseCheckpoint.empty +### RowProducer ```python -def empty() -> bool +class RowProducer() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L68) - -Returns `True` if checkpoint doesn't have any offsets stored yet. - - - - -#### BaseCheckpoint.store\_offset - -```python -def store_offset(topic: str, partition: int, offset: int) -``` +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowproducer.py#L72) -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L75) +A producer class that is capable of serializing Rows to bytes and send them to Kafka. -Store the offset of the processed message to the checkpoint. +The serialization is performed according to the Topic serialization settings. **Arguments**: -- `topic`: topic name -- `partition`: partition number -- `offset`: message offset - - - -#### BaseCheckpoint.close - -```python -@abstractmethod -def close() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L102) - -Perform cleanup (when the checkpoint is empty) instead of committing. - -Needed for exactly-once, as Kafka transactions are timeboxed. +- `broker_address`: Connection settings for Kafka. +Accepts string with Kafka broker host and port formatted as `:`, +or a ConnectionConfig object if authentication is required. +- `extra_config`: A dictionary with additional options that +will be passed to `confluent_kafka.Producer` as is. +Note: values passed as arguments override values in `extra_config`. +- `on_error`: a callback triggered when `RowProducer.produce_row()` +or `RowProducer.poll()` fail`. +If producer fails and the callback returns `True`, the exception +will be logged but not propagated. +The default callback logs an exception and returns `False`. +- `flush_timeout`: The time the producer is waiting for all messages to be delivered. +- `transactional`: whether to use Kafka transactions or not. +Note this changes which underlying `Producer` class is used. - + -#### BaseCheckpoint.commit +#### RowProducer.produce\_row ```python -@abstractmethod -def commit() +def produce_row(row: Row, + topic: Topic, + key: Optional[Any] = _KEY_UNSET, + partition: Optional[int] = None, + timestamp: Optional[int] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L110) - -Commit the checkpoint. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowproducer.py#L119) - +Serialize Row to bytes according to the Topic serialization settings -### Checkpoint +and produce it to Kafka -```python -class Checkpoint(BaseCheckpoint) -``` +If this method fails, it will trigger the provided "on_error" callback. 
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L117) +**Arguments**: -Checkpoint implementation used by the application +- `row`: Row object +- `topic`: Topic object +- `key`: message key, optional +- `partition`: partition number, optional +- `timestamp`: timestamp in milliseconds, optional - + -#### Checkpoint.get\_store\_transaction +#### RowProducer.poll ```python -def get_store_transaction( - topic: str, - partition: int, - store_name: str = DEFAULT_STATE_STORE_NAME) -> PartitionTransaction +def poll(timeout: float = 0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L147) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowproducer.py#L159) -Get a PartitionTransaction for the given store, topic and partition. +Polls the producer for events and calls `on_delivery` callbacks. -It will return already started transaction if there's one. +If `poll()` fails, it will trigger the provided "on_error" callback **Arguments**: -- `topic`: topic name -- `partition`: partition number -- `store_name`: store name - -**Returns**: - -instance of `PartitionTransaction` - - - -#### Checkpoint.close - -```python -def close() -``` - -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L170) - -Perform cleanup (when the checkpoint is empty) instead of committing. - -Needed for exactly-once, as Kafka transactions are timeboxed. +- `timeout`: timeout in seconds - + -#### Checkpoint.commit +#### RowProducer.abort\_transaction ```python -def commit() +def abort_transaction(timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/checkpointing/checkpoint.py#L179) - -Commit the checkpoint. - -This method will: - 1. Produce the changelogs for each state store - 2. Flush the producer to ensure everything is delivered. - 3. Commit topic offsets. - 4. Flush each state store partition to the disk. +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/rowproducer.py#L230) - +Attempt an abort if an active transaction. -## quixstreams.checkpointing +Else, skip since it throws an exception if at least +one transaction was successfully completed at some point. - +This avoids polluting the stack trace in the case where a transaction was +not active as expected (because of some other exception already raised) +and a cleanup abort is attempted. -## quixstreams.checkpointing.exceptions +NOTE: under normal circumstances a transaction will be open due to how +the Checkpoint inits another immediately after committing. 
diff --git a/docs/api-reference/serialization.md b/docs/api-reference/serialization.md index a7d137035..1a4ee71f0 100644 --- a/docs/api-reference/serialization.md +++ b/docs/api-reference/serialization.md @@ -10,7 +10,7 @@ class BytesDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L56) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L56) A deserializer to bypass bytes without any changes @@ -22,7 +22,7 @@ A deserializer to bypass bytes without any changes class BytesSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L65) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L65) A serializer to bypass bytes without any changes @@ -34,7 +34,7 @@ A serializer to bypass bytes without any changes class StringDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L74) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L74) @@ -46,7 +46,7 @@ class StringDeserializer(Deserializer) def __init__(codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L75) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L75) Deserializes bytes to strings using the specified encoding. @@ -65,7 +65,7 @@ A wrapper around `confluent_kafka.serialization.StringDeserializer`. class IntegerDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L93) Deserializes bytes to integers. @@ -79,7 +79,7 @@ A wrapper around `confluent_kafka.serialization.IntegerDeserializer`. class DoubleDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L111) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L111) Deserializes float to IEEE 764 binary64. @@ -93,7 +93,7 @@ A wrapper around `confluent_kafka.serialization.DoubleDeserializer`. class StringSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L129) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L129) @@ -105,7 +105,7 @@ class StringSerializer(Serializer) def __init__(codec: str = "utf_8") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L130) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L130) Serializes strings to bytes using the specified encoding. @@ -123,7 +123,7 @@ Serializes strings to bytes using the specified encoding. 
class IntegerSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L142) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L142) Serializes integers to bytes @@ -135,7 +135,7 @@ Serializes integers to bytes class DoubleSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/simple_types.py#L155) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/simple_types.py#L155) Serializes floats to bytes @@ -151,7 +151,7 @@ Serializes floats to bytes class JSONSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L32) @@ -169,7 +169,7 @@ def __init__( SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L33) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L33) Serializer that returns data in json format. @@ -197,7 +197,7 @@ Default - `None` class JSONDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L119) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L119) @@ -214,7 +214,7 @@ def __init__( ) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/json.py#L120) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/json.py#L120) Deserializer that parses data from JSON @@ -243,7 +243,7 @@ Default - `None` class AvroSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L26) @@ -262,7 +262,7 @@ def __init__( SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L27) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L27) Serializer that returns data in Avro format. @@ -293,7 +293,7 @@ Default - `None` class AvroDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L112) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L112) @@ -314,7 +314,7 @@ def __init__( ) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/avro.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/avro.py#L113) Deserializer that parses data from Avro. 
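As a minimal sketch, the Avro serializer and deserializer above can be attached to topics in the same way as the JSON ones; the schema below is a placeholder, and it is assumed both classes accept a fastavro-style schema via their `schema` argument (check the full signatures for the remaining options):

```python
from quixstreams import Application
from quixstreams.models.serializers.avro import AvroDeserializer, AvroSerializer

# Placeholder fastavro-style record schema (assumed shape, for illustration only).
measurement_schema = {
    "type": "record",
    "name": "Measurement",
    "fields": [{"name": "value", "type": "double"}],
}

app = Application(broker_address="localhost:9092")  # placeholder broker
input_topic = app.topic(
    "avro-input", value_deserializer=AvroDeserializer(schema=measurement_schema)
)
output_topic = app.topic(
    "avro-output", value_serializer=AvroSerializer(schema=measurement_schema)
)
```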
@@ -352,7 +352,7 @@ Default - `None` class ProtobufSerializer(Serializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L24) @@ -370,7 +370,7 @@ def __init__( SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L25) Serializer that returns data in protobuf format. @@ -399,7 +399,7 @@ Default - `None` class ProtobufDeserializer(Deserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L110) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L110) @@ -418,7 +418,7 @@ def __init__( SchemaRegistrySerializationConfig] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/protobuf.py#L111) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/protobuf.py#L111) Deserializer that parses protobuf data into a dictionary suitable for a StreamingDataframe. @@ -455,7 +455,7 @@ Default - `None` class SchemaRegistryClientConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/schema_registry.py#L22) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/schema_registry.py#L22) Configuration required to establish the connection with a Schema Registry. @@ -485,7 +485,7 @@ stored within the PEM as well. class SchemaRegistrySerializationConfig(BaseSettings) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/schema_registry.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/schema_registry.py#L48) Configuration that instructs Serializer how to handle communication with a @@ -529,7 +529,7 @@ then this property must be set to True until all old consumers have been upgrade class QuixDeserializer(JSONDeserializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L76) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L76) Handles Deserialization for any Quix-formatted topic. @@ -545,7 +545,7 @@ Parses JSON data from either `TimeseriesData` and `EventData` (ignores the rest) def __init__(loads: Callable[[Union[bytes, bytearray]], Any] = default_loads) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L83)
@@ -565,7 +565,7 @@ Default - :py:func:`quixstreams.utils.json.loads`. def split_values() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L100) Each Quix message might contain data for multiple Rows. This property informs the downstream processors about that, so they can @@ -582,7 +582,7 @@ def deserialize(model_key: str, value: Union[List[Mapping], Mapping]) -> Iterable[Mapping] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L153) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L153) Deserialization function for particular data types (Timeseries or EventData). @@ -607,7 +607,7 @@ Iterable of dicts class QuixTimeseriesSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L321) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L321) Serialize data to JSON formatted according to Quix Timeseries format. @@ -639,7 +639,7 @@ Output: class QuixEventsSerializer(QuixSerializer) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/serializers/quix.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/serializers/quix.py#L409) Serialize data to JSON formatted according to Quix EventData format. The input value is expected to be a dictionary with the following keys: diff --git a/docs/api-reference/sinks.md b/docs/api-reference/sinks.md index dfffbeaaa..26094a66a 100644 --- a/docs/api-reference/sinks.md +++ b/docs/api-reference/sinks.md @@ -10,7 +10,7 @@ class BaseSink(abc.ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L11) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L11) This is a base class for all sinks. @@ -29,7 +29,7 @@ Note that Sinks are currently in beta, and their design may change over time. def flush(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L21) This method is triggered by the Checkpoint class when it commits. @@ -51,7 +51,7 @@ def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L33) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L33) This method is triggered on every new processed record being sent to this sink. @@ -69,7 +69,7 @@ on flush(). def on_paused(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L51) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L51) This method is triggered when the sink is paused due to backpressure, when the `SinkBackpressureError` is raised. 
@@ -84,7 +84,7 @@ Here you can react to the backpressure events. class BatchingSink(BaseSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L60) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L60) A base class for batching sinks, that need to accumulate the data first before sending it to the external destinatios. @@ -108,7 +108,7 @@ batching sink. def write(batch: SinkBatch) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L83) This method implements actual writing to the external destination. @@ -128,7 +128,7 @@ def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, topic: str, partition: int, offset: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L93) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L93) Add a new record to in-memory batch. @@ -142,7 +142,7 @@ Add a new record to in-memory batch. def flush(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L115) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L115) Flush an accumulated batch to the destination and drop it afterward. @@ -156,7 +156,7 @@ Flush an accumulated batch to the destination and drop it afterward. def on_paused(topic: str, partition: int) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/sink.py#L135) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/sink.py#L135) When the destination is already backpressure, drop the accumulated batch. @@ -172,7 +172,7 @@ When the destination is already backpressure, drop the accumulated batch. class SinkBatch() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/batch.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/batch.py#L12) A batch to accumulate processed data by `BatchingSink` between the checkpoints. @@ -195,7 +195,7 @@ Batches are created automatically by the implementations of `BatchingSink`. def iter_chunks(n: int) -> Iterable[Iterable[SinkItem]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/batch.py#L69) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/batch.py#L69) Iterate over batch data in chunks of length n. The last batch may be shorter. @@ -212,7 +212,7 @@ The last batch may be shorter. class SinkBackpressureError(QuixException) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/base/exceptions.py#L6) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/base/exceptions.py#L6) An exception to be raised by Sinks during flush() call @@ -242,7 +242,7 @@ a timeout specified in `retry_after`, and resume it when it's elapsed. 
class InfluxDB3Sink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/influxdb3.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/influxdb3.py#L23) @@ -267,7 +267,7 @@ def __init__(token: str, debug: bool = False) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/influxdb3.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/influxdb3.py#L24) A connector to sink processed data to InfluxDB v3. @@ -336,7 +336,7 @@ Default - `False`. class CSVSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/csv.py#L9) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/csv.py#L9) @@ -351,7 +351,7 @@ def __init__(path: str, value_serializer: Callable[[Any], str] = json.dumps) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/core/csv.py#L10) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/core/csv.py#L10) A base CSV sink that writes data from all assigned partitions to a single file. @@ -385,7 +385,7 @@ Default - `json.dumps`. class AWSIcebergConfig(BaseIcebergConfig) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L42) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L42) @@ -401,7 +401,7 @@ def __init__(aws_s3_uri: str, aws_session_token: Optional[str] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L43) Configure IcebergSink to work with AWS Glue. @@ -430,7 +430,7 @@ using AWS Glue. class IcebergSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L76) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L76) IcebergSink writes batches of data to an Apache Iceberg table. @@ -501,7 +501,7 @@ if __name__ == "__main__": def write(batch: SinkBatch) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/iceberg.py#L174) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/iceberg.py#L174) Writes a batch of data to the Iceberg table. @@ -525,7 +525,7 @@ Implements retry logic to handle concurrent write conflicts. class BigQuerySink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/bigquery.py#L53) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/bigquery.py#L53) @@ -546,7 +546,7 @@ def __init__(project_id: str, **kwargs) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/bigquery.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/bigquery.py#L54) A connector to sink processed data to Google Cloud BigQuery. @@ -602,7 +602,7 @@ to the client's default retrying policy. 
class FileSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/sink.py#L11) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/sink.py#L11) A sink that writes data batches to files using configurable formats and destinations. @@ -628,7 +628,7 @@ def __init__(directory: str = "", destination: Optional[Destination] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/sink.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/sink.py#L25) Initialize the FileSink with the specified configuration. @@ -653,7 +653,7 @@ LocalDestination if not specified. def write(batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/sink.py#L46) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/sink.py#L46) Write a batch of data using the configured format and destination. @@ -685,7 +685,7 @@ that the sink needs backpressure with a 5-second retry delay. class Destination(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L16) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L16) Abstract base class for defining where and how data should be stored. @@ -704,7 +704,7 @@ and partitions. def set_directory(directory: str) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L28) Configure the base directory for storing files. @@ -730,7 +730,7 @@ underscores are allowed. def set_extension(format: Format) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L45) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L45) Set the file extension based on the format. @@ -751,7 +751,7 @@ Set the file extension based on the format. def write(data: bytes, batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/base.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/base.py#L54) Write the serialized data to storage. @@ -775,7 +775,7 @@ details. class LocalDestination(Destination) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L15) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L15) A destination that writes data to the local filesystem. @@ -792,7 +792,7 @@ and appending to existing ones. 
def __init__(append: bool = False) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L22) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L22) Initialize the local destination. @@ -813,7 +813,7 @@ ones. Defaults to False. def set_extension(format: Format) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L32) Set the file extension and validate append mode compatibility. @@ -838,7 +838,7 @@ support appending. def write(data: bytes, batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/local.py#L43) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/local.py#L43) Write data to a local file. @@ -861,7 +861,7 @@ Write data to a local file. class S3BucketNotFoundError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L13) Raised when the specified S3 bucket does not exist. @@ -873,7 +873,7 @@ Raised when the specified S3 bucket does not exist. class S3BucketAccessDeniedError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L17) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L17) Raised when the specified S3 bucket access is denied. @@ -885,7 +885,7 @@ Raised when the specified S3 bucket access is denied. class S3Destination(Destination) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L21) A destination that writes data to Amazon S3. @@ -908,7 +908,7 @@ def __init__(bucket: str, **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L28) Initialize the S3 destination. @@ -940,7 +940,7 @@ AWS_DEFAULT_REGION environment variable. def write(data: bytes, batch: SinkBatch) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/destinations/s3.py#L78) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/destinations/s3.py#L78) Write data to S3. @@ -963,7 +963,7 @@ Write data to S3. class Format(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L8) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L8) Base class for formatting batches in file sinks. 
@@ -984,7 +984,7 @@ formatted and saved. def file_extension() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L20) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L20) Returns the file extension used for output files. @@ -1006,7 +1006,7 @@ The file extension as a string. def supports_append() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L30) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L30) Indicates if the format supports appending data to an existing file. @@ -1027,7 +1027,7 @@ True if appending is supported, otherwise False. def serialize(batch: SinkBatch) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/base.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/base.py#L39) Serializes a batch of messages into bytes. @@ -1055,7 +1055,7 @@ The serialized batch as bytes. class JSONFormat(Format) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L14) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L14) Serializes batches of messages into JSON Lines format with optional gzip compression. @@ -1078,7 +1078,7 @@ def __init__(file_extension: str = ".jsonl", dumps: Optional[Callable[[Any], str]] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L28) Initializes the JSONFormat. @@ -1104,7 +1104,7 @@ strings. If provided, the `compact` option is ignored. def file_extension() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L57) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L57) Returns the file extension used for output files. @@ -1124,7 +1124,7 @@ The file extension as a string. def serialize(batch: SinkBatch) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/json.py#L65) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/json.py#L65) Serializes a `SinkBatch` into bytes in JSON Lines format. @@ -1157,7 +1157,7 @@ compressed with gzip. class ParquetFormat(Format) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L16) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L16) Serializes batches of messages into Parquet format. 
@@ -1178,7 +1178,7 @@ def __init__(file_extension: str = ".parquet", compression: Compression = "snappy") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L29) Initializes the ParquetFormat. @@ -1203,7 +1203,7 @@ or "zstd". Defaults to "snappy". def file_extension() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L47) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L47) Returns the file extension used for output files. @@ -1223,7 +1223,7 @@ The file extension as a string. def serialize(batch: SinkBatch) -> bytes ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/file/formats/parquet.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/file/formats/parquet.py#L55) Serializes a `SinkBatch` into bytes in Parquet format. @@ -1258,7 +1258,7 @@ The serialized batch as bytes in Parquet format. class PubSubTopicNotFoundError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L25) Raised when the specified topic does not exist. @@ -1270,7 +1270,7 @@ Raised when the specified topic does not exist. class PubSubSink(BaseSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L29) A sink that publishes messages to Google Cloud Pub/Sub. @@ -1290,7 +1290,7 @@ def __init__(project_id: str, **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L32) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L32) Initialize the PubSubSink. @@ -1322,7 +1322,7 @@ def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, topic: str, partition: int, offset: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L81) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L81) Publish a message to Pub/Sub. @@ -1336,7 +1336,7 @@ Publish a message to Pub/Sub. def flush(topic: str, partition: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/pubsub.py#L114) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/pubsub.py#L114) Wait for all publish operations to complete successfully. @@ -1352,7 +1352,7 @@ Wait for all publish operations to complete successfully. 
class PostgreSQLSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/postgresql.py#L48) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/postgresql.py#L48) @@ -1371,7 +1371,7 @@ def __init__(host: str, **kwargs) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/postgresql.py#L49) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/postgresql.py#L49) A connector to sink topic data to PostgreSQL. @@ -1401,7 +1401,7 @@ A connector to sink topic data to PostgreSQL. class KinesisStreamNotFoundError(Exception) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L23) Raised when the specified Kinesis stream does not exist. @@ -1413,7 +1413,7 @@ Raised when the specified Kinesis stream does not exist. class KinesisSink(BaseSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L27) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L27) @@ -1433,7 +1433,7 @@ def __init__(stream_name: str, **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L28) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L28) Initialize the KinesisSink. @@ -1462,7 +1462,7 @@ def add(value: Any, key: Any, timestamp: int, headers: HeadersTuples, topic: str, partition: int, offset: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L80) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L80) Buffer a record for the Kinesis stream. @@ -1480,7 +1480,7 @@ will be sent when the flush method is called. def flush(topic: str, partition: int) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/kinesis.py#L110) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/kinesis.py#L110) Flush all buffered records for a given topic-partition. @@ -1501,7 +1501,7 @@ stream. class RedisSink(BatchingSink) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/redis.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/redis.py#L21) @@ -1521,7 +1521,7 @@ def __init__(host: str, **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sinks/community/redis.py#L22) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sinks/community/redis.py#L22) A connector to sink processed data to Redis. 
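To tie the sink reference above together, here is a minimal end-to-end sketch of `FileSink` with `S3Destination` and `ParquetFormat`. It is illustrative only: the broker address, topic, directory, and bucket names are placeholders, and the import paths are inferred from the module layout linked above (a recent `quixstreams` release with the community sinks installed is assumed).

```python
from quixstreams import Application
from quixstreams.sinks.community.file import FileSink
from quixstreams.sinks.community.file.destinations import S3Destination
from quixstreams.sinks.community.file.formats import ParquetFormat

app = Application(broker_address="localhost:9092", consumer_group="file-sink-demo")
readings_topic = app.topic(name="sensor-readings")

# Each batch is serialized to Parquet and uploaded to the bucket;
# AWS credentials are picked up from the usual environment variables.
file_sink = FileSink(
    directory="telemetry",
    format=ParquetFormat(compression="snappy"),
    destination=S3Destination(bucket="example-telemetry-bucket"),
)

sdf = app.dataframe(topic=readings_topic)
sdf.sink(file_sink)

if __name__ == "__main__":
    app.run()
```

Swapping `S3Destination` for `LocalDestination(append=True)` with a `JSONFormat` would write JSON Lines files to the local filesystem instead; per the docs above, append mode requires a format whose `supports_append()` returns `True`.
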
diff --git a/docs/api-reference/sources.md b/docs/api-reference/sources.md index 7eb102d85..78aa399e6 100644 --- a/docs/api-reference/sources.md +++ b/docs/api-reference/sources.md @@ -10,7 +10,7 @@ class BaseSource(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L17) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L17) This is the base class for all sources. @@ -83,7 +83,7 @@ if __name__ == "__main__": def configure(topic: Topic, producer: RowProducer, **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L88) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L88) This method is triggered before the source is started. @@ -100,7 +100,7 @@ It configures the source's Kafka producer, the topic it will produce to and opti def start() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L110) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L110) This method is triggered in the subprocess when the source is started. @@ -118,7 +118,7 @@ Use it to fetch data and produce it to Kafka. def stop() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L119) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L119) This method is triggered when the application is shutting down. @@ -135,7 +135,7 @@ The source must ensure that the `run` method is completed soon. def default_topic() -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L127) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L127) This method is triggered when the topic is not provided to the source. @@ -151,7 +151,7 @@ Note: if the default topic is used, the Application will prefix its name with "s class Source(BaseSource) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L137) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L137) A base class for custom Sources that provides a basic implementation of `BaseSource` interface. @@ -211,7 +211,7 @@ if __name__ == "__main__": def __init__(name: str, shutdown_timeout: float = 10) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L187) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L187)
@@ -231,7 +231,7 @@ def __init__(name: str, shutdown_timeout: float = 10) -> None def running() -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L201) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L201) Property indicating if the source is running. @@ -247,7 +247,7 @@ The `stop` method will set it to `False`. Use it to stop the source gracefully. def cleanup(failed: bool) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L209) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L209) This method is triggered once the `run` method completes. @@ -265,7 +265,7 @@ It flushes the producer when `_run` completes successfully. def stop() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L220) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L220) This method is triggered when the application is shutting down. @@ -281,7 +281,7 @@ It sets the `running` property to `False`. def start() -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L228) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L228) This method is triggered in the subprocess when the source is started. @@ -298,7 +298,7 @@ It marks the source as running, execute it's run method and ensure cleanup happe def run() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L244) This method is triggered in the subprocess when the source is started. @@ -318,7 +318,7 @@ def serialize(key: Optional[object] = None, timestamp_ms: Optional[int] = None) -> KafkaMessage ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L252) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L252) Serialize data to bytes using the producer topic serializers and return a `quixstreams.models.messages.KafkaMessage`. @@ -344,7 +344,7 @@ def produce(value: Optional[Union[str, bytes]] = None, buffer_error_max_tries: int = 3) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L268) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L268) Produce a message to the configured source topic in Kafka. @@ -358,7 +358,7 @@ Produce a message to the configured source topic in Kafka. def flush(timeout: Optional[float] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L293) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L293) This method flush the producer. @@ -385,7 +385,7 @@ None use producer default or -1 is infinite. 
Default: None def default_topic() -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L311) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L311) Return a default topic matching the source name. @@ -407,7 +407,7 @@ Note: if the default topic is used, the Application will prefix its name with "s class StatefulSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L330) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L330) A `Source` class for custom Sources that need a state. @@ -467,7 +467,7 @@ if __name__ == "__main__": def __init__(name: str, shutdown_timeout: float = 10) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L380) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L380)
@@ -490,7 +490,7 @@ def configure(topic: Topic, **kwargs) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L390) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L390) This method is triggered before the source is started. @@ -507,7 +507,7 @@ It configures the source's Kafka producer, the topic it will produce to and the def store_partitions_count() -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L409) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L409) Count of store partitions. @@ -524,7 +524,7 @@ Used to configure the number of partition in the changelog topic. def assigned_store_partition() -> int ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L418) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L418) The store partition assigned to this instance @@ -539,7 +539,7 @@ The store partition assigned to this instance def store_name() -> str ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L425) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L425) The source store name @@ -554,7 +554,7 @@ The source store name def state() -> State ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L432) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L432) Access the `State` of the source. @@ -572,7 +572,7 @@ Important: after each `.flush()` call, a previously returned instance is invalid def flush(timeout: Optional[float] = None) -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/base/source.py#L451) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/base/source.py#L451) This method commit the state and flush the producer. @@ -601,7 +601,7 @@ None use producer default or -1 is infinite. Default: None class CSVSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/csv.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/csv.py#L13) @@ -620,7 +620,7 @@ def __init__(path: Union[str, Path], dialect: str = "excel") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/csv.py#L14) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/csv.py#L14) A base CSV source that reads data from a CSV file and produces rows @@ -660,7 +660,7 @@ Default - `"excel"`. class KafkaReplicatorSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/kafka.py#L25) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/kafka.py#L25) Source implementation that replicates a topic from a Kafka broker to your application broker. 
@@ -711,7 +711,7 @@ def __init__( key_deserializer: DeserializerType = "bytes") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/kafka.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/kafka.py#L54)
@@ -750,7 +750,7 @@ Default - `json` class QuixEnvironmentSource(KafkaReplicatorSource) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/quix.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/quix.py#L19) Source implementation that replicates a topic from a Quix Cloud environment to your application broker. It can copy messages for development and testing without risking producing them back or affecting the consumer groups. @@ -805,7 +805,7 @@ def __init__( key_deserializer: DeserializerType = "bytes") -> None ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/core/kafka/quix.py#L50) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/core/kafka/quix.py#L50)
@@ -830,7 +830,7 @@ For other parameters See `quixstreams.sources.kafka.KafkaReplicatorSource` class FileSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/file.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/file.py#L19) Ingest a set of files from a desired origin into Kafka by iterating through the provided folder and processing all nested files within it. @@ -905,7 +905,7 @@ def __init__(directory: Union[str, Path], shutdown_timeout: float = 10) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/file.py#L79) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/file.py#L79)
@@ -936,7 +936,7 @@ to gracefully shutdown def default_topic() -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/file.py#L152) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/file.py#L152) Uses the file structure to generate the desired partition count for the @@ -968,7 +968,7 @@ the original default topic, with updated partition count class S3Origin(Origin) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/origins/s3.py#L23) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/origins/s3.py#L23) @@ -985,7 +985,7 @@ def __init__( endpoint_url: Optional[str] = getenv("AWS_ENDPOINT_URL_S3")) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/origins/s3.py#L24) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/origins/s3.py#L24) Configure IcebergSink to work with AWS Glue. @@ -1016,7 +1016,7 @@ NOTE: can alternatively set the AWS_ENDPOINT_URL_S3 environment variable class JSONFormat(Format) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/json.py#L12) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/json.py#L12) @@ -1029,7 +1029,7 @@ def __init__(compression: Optional[CompressionName], loads: Optional[Callable[[str], dict]] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/file/formats/json.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/file/formats/json.py#L13) Read a JSON-formatted file (along with decompressing it). @@ -1057,7 +1057,7 @@ with {_key: str, _value: dict, _timestamp: int}. class KinesisSource(StatefulSource) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/kinesis.py#L18) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/kinesis.py#L18) NOTE: Requires `pip install quixstreams[kinesis]` to work. @@ -1115,7 +1115,7 @@ def __init__( retry_backoff_secs: float = 5.0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/kinesis/kinesis.py#L57) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/kinesis/kinesis.py#L57)
@@ -1152,7 +1152,7 @@ shard when Kinesis consumer encounters handled/expected errors. class PubSubSource(Source) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/pubsub.py#L16) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/pubsub.py#L16) This source enables reading from a Google Cloud Pub/Sub topic, dumping it to a kafka topic using desired SDF-based transformations. @@ -1208,7 +1208,7 @@ def __init__(project_id: str, shutdown_timeout: float = 10.0) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/sources/community/pubsub/pubsub.py#L55) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/sources/community/pubsub/pubsub.py#L55)
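Before moving on to the state API, a compact sketch may help show how the `Source` pieces above (`running`, `serialize`, `produce`) fit together. The counter logic and all names here are invented for illustration and assume a recent `quixstreams` release.

```python
import time

from quixstreams import Application
from quixstreams.sources import Source


class CounterSource(Source):
    """A toy source that emits an incrementing counter once per second."""

    def run(self):
        count = 0
        while self.running:  # flips to False when the Application shuts down
            msg = self.serialize(key="counter", value={"count": count})
            self.produce(key=msg.key, value=msg.value)
            count += 1
            time.sleep(1)


app = Application(broker_address="localhost:9092")
sdf = app.dataframe(source=CounterSource(name="counter-source"))
sdf.print()

if __name__ == "__main__":
    app.run()
```
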
diff --git a/docs/api-reference/state.md b/docs/api-reference/state.md index 4ff06cc2e..647102731 100644 --- a/docs/api-reference/state.md +++ b/docs/api-reference/state.md @@ -10,7 +10,7 @@ class State(ABC) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L13) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L13) Primary interface for working with key-value state data from `StreamingDataFrame` @@ -25,7 +25,7 @@ Primary interface for working with key-value state data from `StreamingDataFrame def get(key: Any, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L19) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L19) Get the value for key if key is present in the state, else default @@ -53,7 +53,7 @@ value or None if the key is not found and `default` is not provided def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L30) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L30) Set value for the key. @@ -75,7 +75,7 @@ Set value for the key. def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L39) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L39) Delete value for the key. @@ -98,7 +98,7 @@ This function always returns `None`, even if value is not found. def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L49) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L49) Check if the key exists in state. 
@@ -122,7 +122,7 @@ True if key exists, False otherwise class TransactionState(State) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L58) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L58) @@ -134,7 +134,7 @@ class TransactionState(State) def __init__(prefix: bytes, transaction: "PartitionTransaction") ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L64) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L64) Simple key-value state to be provided into `StreamingDataFrame` functions @@ -154,7 +154,7 @@ Simple key-value state to be provided into `StreamingDataFrame` functions def get(key: Any, default: Any = None) -> Optional[Any] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L73) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L73) Get the value for key if key is present in the state, else default @@ -181,7 +181,7 @@ value or None if the key is not found and `default` is not provided def set(key: Any, value: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L83) Set value for the key. @@ -202,7 +202,7 @@ Set value for the key. def delete(key: Any) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L91) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L91) Delete value for the key. @@ -224,7 +224,7 @@ This function always returns `None`, even if value is not found. def exists(key: Any) -> bool ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/base/state.py#L100) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/base/state.py#L100) Check if the key exists in state. @@ -253,7 +253,7 @@ True if key exists, False otherwise class RocksDBOptions(RocksDBOptionsType) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/options.py#L26) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/options.py#L26) RocksDB database options. @@ -278,7 +278,7 @@ Please see `rocksdict.Options` for a complete description of other options. 
def to_options() -> rocksdict.Options ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/state/rocksdb/options.py#L54) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/state/rocksdb/options.py#L54) Convert parameters to `rocksdict.Options` diff --git a/docs/api-reference/topics.md b/docs/api-reference/topics.md index 201671c52..c0dd06403 100644 --- a/docs/api-reference/topics.md +++ b/docs/api-reference/topics.md @@ -16,7 +16,7 @@ def convert_topic_list(topics: List[Topic]) -> List[ConfluentTopic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L29) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L29) Converts `Topic`s to `ConfluentTopic`s as required for Confluent's @@ -42,7 +42,7 @@ list of confluent_kafka `ConfluentTopic`s class TopicAdmin() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L52) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L52) For performing "admin"-level operations on a Kafka cluster, mostly around topics. @@ -60,7 +60,7 @@ def __init__(broker_address: Union[str, ConnectionConfig], extra_config: Optional[Mapping] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L59) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L59)
@@ -82,7 +82,7 @@ or a ConnectionConfig object if authentication is required. def list_topics(timeout: float = -1) -> Dict[str, ConfluentTopicMetadata] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L91) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L91) Get a list of topics and their metadata from a Kafka cluster @@ -109,7 +109,7 @@ def inspect_topics(topic_names: List[str], timeout: float = 30) -> Dict[str, Optional[TopicConfig]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L102) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L102) A simplified way of getting the topic configurations of the provided topics @@ -141,7 +141,7 @@ def create_topics(topics: List[Topic], finalize_timeout: float = 60) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/admin.py#L184) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/admin.py#L184) Create the given list of topics and confirm they are ready. @@ -170,7 +170,7 @@ fail (it ignores issues for a topic already existing). class TopicConfig() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L42) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L42) Represents all kafka-level configuration for a kafka topic. @@ -184,7 +184,7 @@ Generally used by Topic and any topic creation procedures. class Topic() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L83) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L83) A definition of a Kafka topic. @@ -209,7 +209,7 @@ def __init__( timestamp_extractor: Optional[TimestampExtractor] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L92) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L92)
@@ -234,7 +234,7 @@ milliseconds from a deserialized message. def row_serialize(row: Row, key: Any) -> KafkaMessage ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L140) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L140) Serialize Row to a Kafka message structure @@ -262,7 +262,7 @@ def row_deserialize( message: ConfluentKafkaMessageProto) -> Union[Row, List[Row], None] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/topic.py#L180) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/topic.py#L180) Deserialize incoming Kafka message to a Row. @@ -292,7 +292,7 @@ Row, list of Rows or None if the message is ignored. def affirm_ready_for_create(topics: List[Topic]) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L21) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L21) Validate a list of topics is ready for creation attempt @@ -310,7 +310,7 @@ Validate a list of topics is ready for creation attempt class TopicManager() ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L31) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L31) The source of all topic management for a Quix Streams Application. @@ -332,7 +332,7 @@ def __init__(topic_admin: TopicAdmin, auto_create_topics: bool = True) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L52) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L52)
@@ -354,7 +354,7 @@ def __init__(topic_admin: TopicAdmin, def changelog_topics() -> Dict[Optional[str], Dict[str, Topic]] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L104) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L104) Note: `Topic`s are the changelogs. @@ -371,7 +371,7 @@ returns: the changelog topic dict, {topic_name: {suffix: Topic}} def all_topics() -> Dict[str, Topic] ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L113) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L113) Every registered topic name mapped to its respective `Topic`. @@ -389,7 +389,7 @@ def topic_config(num_partitions: Optional[int] = None, extra_config: Optional[dict] = None) -> TopicConfig ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L223) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L223) Convenience method for generating a `TopicConfig` with default settings @@ -423,7 +423,7 @@ def topic(name: str, timestamp_extractor: Optional[TimestampExtractor] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L244) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L244) A convenience method for generating a `Topic`. Will use default config options @@ -458,7 +458,7 @@ Topic object with creation configs def register(topic: Topic) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L290) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L290) Register an already generated :class:`quixstreams.models.topics.Topic` to the topic manager. @@ -486,7 +486,7 @@ def repartition_topic(operation: str, timeout: Optional[float] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L308) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L308) Create an internal repartition topic. @@ -521,7 +521,7 @@ def changelog_topic(topic_name: Optional[str], timeout: Optional[float] = None) -> Topic ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L348) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L348) Performs all the logic necessary to generate a changelog topic based on an @@ -569,7 +569,7 @@ def create_topics(topics: List[Topic], create_timeout: Optional[float] = None) ``` -[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L416) +[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L416) Creates topics via an explicit list of provided `Topics`. 
@@ -595,7 +595,7 @@ def create_all_topics(timeout: Optional[float] = None,
                       create_timeout: Optional[float] = None)
 ```
 
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L444)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L444)
 
 A convenience method to create all Topic objects stored on this TopicManager.
 
@@ -618,7 +618,7 @@ If `auto_create_topics` is set to False no topic will be created.
 def validate_all_topics(timeout: Optional[float] = None)
 ```
 
-[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/main/quixstreams/models/topics/manager.py#L460)
+[[VIEW SOURCE]](https://github.com/quixio/quix-streams/blob/chore/example-cleanup/quixstreams/models/topics/manager.py#L460)
 
 Validates all topics exist and changelogs have correct topic and rep factor.
 
diff --git a/docs/build/README.md b/docs/build/README.md
index 086b36d31..c5e3e9111 100644
--- a/docs/build/README.md
+++ b/docs/build/README.md
@@ -3,9 +3,16 @@
 Generate API docs for `quixstreams` module using
 [Pydoc Markdown](https://niklasrosenstein.github.io/pydoc-markdown/just-generate-me-some-markdown/).
 
-To generate new API docs
+## Generate new API docs
 
 - Go to `docs/build`
 - Install requirements via `python -m pip install -r requirements.txt`
 - do `./build.sh`
 - Check the generated docs in `docs/` folder
+
+
+## Render/View Docs
+- Go to `docs/build`
+- `python -m pip install mkdocs mkdocs-material mkdocs-material-extensions`
+- `mkdocs serve -f ../../mkdocs.yml`
+- [Navigate to `localhost:8000` in your browser](http://localhost:8000)
diff --git a/docs/tutorials/anomaly-detection/tutorial.md b/docs/tutorials/anomaly-detection/tutorial.md
index 7ec907a8c..09576585c 100644
--- a/docs/tutorials/anomaly-detection/tutorial.md
+++ b/docs/tutorials/anomaly-detection/tutorial.md
@@ -30,9 +30,6 @@ We will use a [Quix Streams `Source`](../../connectors/sources/README.md) to gen
 
 These events will be processed by our new Anomaly Detector `Application`.
 
-NOTE: our example uses JSON formatting for Kafka message values.
-
-
 
 ## Alerting Approach (Windowing)
 
@@ -112,10 +109,13 @@ app = Application(
 Create a [Quix Streams Application](../../configuration.md), which is our constructor for everything!
 
 We provide it our connection settings, consumer group (ideally unique per Application),
-and where the consumer group should start from on the (internal) Source topic.
+and where the consumer group should start from on the (internal) `Source` topic.
+
+!!! TIP
+
+    Once you are more familiar with Kafka, we recommend
+    [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls).
 
-> [!TIP]
-> Once you are more familiar with Kafka, we recommend [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls).
 
 
 
@@ -125,8 +125,10 @@ and where the consumer group should start from on the (internal) Source topic.
 
 Create one for each topic used by your `Application`.
 
-> [!NOTE]
-> Any missing topics will be automatically created for you upon running the application.
+!!! NOTE
+
+    Any missing topics will be automatically created for you upon running an `Application`.
+
 
 #### Our Topics
 We have one output topic, named `price_updates`:
@@ -147,7 +149,14 @@ Now for the fun part: building our [StreamingDataFrame](../../processing.md#intr
 
 SDF allows manipulating the message value in a dataframe-like fashion using various operations.
 
-After initializing, we continue re-assigning to the same `sdf` variable as we add operations. +After initializing with either a `Topic` or `Source`, we continue reassigning to the +same `sdf` variable as we add operations. + +!!! NOTE + + A few `StreamingDataFrame` operations are + ["in-place"](../../advanced/dataframe-assignments.md#valid-in-place-operations), + like `.print()`. (Also: notice that we pass our input `Topic` from the previous step to it.) @@ -228,7 +237,11 @@ sdf = sdf.to_topic(alerts_topic) However, if the value ended up >= 90....we finally finish by producing our alert to our downstream topic via [`SDF.to_topic(T)`](../../processing.md#writing-data-to-kafka-topics), where `T` is our previously defined `Topic` (not the topic name!). -NOTE: because we use "Current" windowing, we may produce a lot of "duplicate" alerts once triggered...you could solve this in numerous ways downstream. What we care about is alerting as soon as possible! +!!! NOTE + + Because we use "Current" windowing, we may produce a lot of "duplicate" alerts once + triggered...you could solve this in numerous ways downstream. What we care about is + alerting as soon as possible! diff --git a/docs/tutorials/purchase-filtering/tutorial.md b/docs/tutorials/purchase-filtering/tutorial.md index d618e93c8..0f2d39e97 100644 --- a/docs/tutorials/purchase-filtering/tutorial.md +++ b/docs/tutorials/purchase-filtering/tutorial.md @@ -34,7 +34,6 @@ We will use a [Quix Streams `Source`](../../connectors/sources/README.md) to gen processed by our new Purchase Filtering `Application`. - ## Important Takeaways The primary lesson: learning how you can use common pandas-like @@ -120,10 +119,12 @@ app = Application( Create a [Quix Streams Application](../../configuration.md), which is our constructor for everything! We provide it our connection settings, consumer group (ideally unique per Application), -and where the consumer group should start from on the (internal) Source topic. +and where the consumer group should start from on the (internal) `Source` topic. + +!!! TIP -> [!TIP] -> Once you are more familiar with Kafka, we recommend [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls). + Once you are more familiar with Kafka, we recommend + [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls). @@ -133,8 +134,9 @@ and where the consumer group should start from on the (internal) Source topic. Create one for each topic used by your `Application`. -> [!NOTE] -> Any missing topics will be automatically created for you upon running the application. +!!! NOTE + + Any missing topics will be automatically created for you upon running an `Application`. #### Our Topics We have one output topic, named `customers_coupon_qualified`: @@ -155,7 +157,14 @@ Now for the fun part: building our [StreamingDataFrame](../../processing.md#intr SDF allows manipulating the message value in a dataframe-like fashion using various operations. -After initializing, we continue re-assigning to the same `sdf` variable as we add operations. +After initializing with either a `Topic` or `Source`, we continue reassigning to the +same `sdf` variable as we add operations. + +!!! NOTE + + A few `StreamingDataFrame` operations are + ["in-place"](../../advanced/dataframe-assignments.md#valid-in-place-operations), + like `.print()`. (Also: notice that we pass our input `Topic` from the previous step to it.) 
@@ -203,11 +212,13 @@ sdf["Membership Type"].isin(["Silver", "Gold"]) We additionally showcase one of our built-in column operations `.isin()`, a way for SDF to perform an `if x in y` check (SDF is declaratively defined, invalidating that approach). -**NOTE**: some operations (like `.isin()`) are only available when manipulating a column. +!!! INFO + + Some operations (like `.isin()`) are only available when manipulating a column. - - if you're unsure what's possible, autocomplete often covers you! + - if you're unsure what's possible, autocomplete often covers you! - - _ADVANCED_: [complete list of column operations](../../api-reference/dataframe.md#streamingseries). + - _ADVANCED_: [complete list of column operations](../../api-reference/dataframe.md#streamingseries).
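As a quick illustration of the column operations mentioned above, the snippet below combines `.isin()` with another column check. It is a standalone sketch, not part of the tutorial's code: `purchases_topic` and the `"Email"` field are assumed names, and only the `"Membership Type"` filter mirrors the tutorial.

```python
sdf = app.dataframe(topic=purchases_topic)

# Column operations can be combined with & (and) / | (or) before filtering:
sdf = sdf[
    sdf["Membership Type"].isin(["Silver", "Gold"])  # membership tier check
    & sdf["Email"].contains("@")                     # substring check on another column
]
```
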
@@ -292,7 +303,9 @@ becomes >>> {"Email": "cool email", "Full Name": "cool name"}` ``` -NOTE: you cannot reference nested keys in this way. +!!! WARNING + + You cannot reference nested keys in this way. @@ -305,8 +318,10 @@ sdf = sdf.to_topic(customers_qualified_topic) Finally, we produce our non-filtered results downstream via [`SDF.to_topic(T)`](../../processing.md#writing-data-to-kafka-topics), where `T` is our previously defined `Topic` (not the topic name!). -NOTE: by default, our outgoing Kafka key is persisted from the input message. -[You can alter it](../../processing.md#changing-message-key-before-producing), if needed. +!!! INFO + + By default, our outgoing Kafka key is persisted from the input message. + [You can alter it](../../processing.md#changing-message-key-before-producing), if needed. ### Running an Application @@ -349,7 +364,11 @@ One thing to keep in mind is that the Quix Streams does not log/print any messag operations by default. To get visual outputs around message processing, you can either: + - use [recommended way of printing/logging with SDF](../../processing.md#debugging) - + - use `DEBUG` mode via `Application(loglevel="DEBUG")` - - WARNING: you should NOT run your applications in `DEBUG` mode in production. + + !!! DANGER + + you should NOT run your applications in `DEBUG` mode in production. \ No newline at end of file diff --git a/docs/tutorials/websocket-source/tutorial.md b/docs/tutorials/websocket-source/tutorial.md index ac8d495ab..c3456017f 100644 --- a/docs/tutorials/websocket-source/tutorial.md +++ b/docs/tutorials/websocket-source/tutorial.md @@ -72,16 +72,22 @@ Now we set up the data retrieval loop contained within a `while self.running` bl This is so a shutdown from the `Application` level also gracefully exits this loop; the `Source` essentially stops if the `Source.run()` method is ever exited. -> [!NOTE] -> Since no other teardown is required for websockets, nothing happens after the -> `while self.running` block. +!!! NOTE + + Since no other teardown is required for websockets, nothing happens after the + `while self.running` block. Inside this block, records are retrieved, serialized (to `JSON`), and produced to an underlying internal topic as close to its raw form as possible (user-level manipulations occur at the `Application` level using a `StreamingDataFrame`). -> [!TIP] -> The internal topic can accept other data serializations by overriding `Source.default_topic()`. +!!! TIP + + The internal topic can accept other data serializations by overriding + `Source.default_topic()`. + + + ## Using `CoinbaseSource` @@ -113,8 +119,10 @@ Create a [Quix Streams Application](../../configuration.md), which is our constr We provide it our connection settings, consumer group (ideally unique per Application), and where the consumer group should start from on the (internal) Source topic. -> [!TIP] -> Once you are more familiar with Kafka, we recommend [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls). +!!! TIP + + Once you are more familiar with Kafka, we recommend + [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls). #### Our Application ```python @@ -131,8 +139,11 @@ app = Application( Create one for each topic used by your `Application`. -> [!NOTE] -> Any missing topics will be automatically created for you upon running the application. +!!! 
NOTE + + Any missing topics will be automatically created for you upon running an `Application`. + + #### Our Topics We have one output topic, named `price_updates`: @@ -147,12 +158,16 @@ Now for the fun part: building our [StreamingDataFrame](../../processing.md#intr SDF allows manipulating the message value in a dataframe-like fashion using various operations. -After initializing with either a `Topic` or `Source`, we continue re-assigning to the same `sdf` variable as we add operations. +After initializing with either a `Topic` or `Source`, we continue reassigning to the +same `sdf` variable as we add operations. + +!!! NOTE + + A few `StreamingDataFrame` operations are + ["in-place"](../../advanced/dataframe-assignments.md#valid-in-place-operations), + like `.print()`. + -> [!NOTE] -> A few `StreamingDataFrame` operations are -> ["in-place"](../../advanced/dataframe-assignments.md#valid-in-place-operations), -> like `.print()`. #### Our SDF operations First, we initialize our SDF with our `coinbase_source`. diff --git a/docs/tutorials/word-count/tutorial.md b/docs/tutorials/word-count/tutorial.md index 0581668e7..24ee92777 100644 --- a/docs/tutorials/word-count/tutorial.md +++ b/docs/tutorials/word-count/tutorial.md @@ -30,7 +30,10 @@ the counts of each individually downstream for further processing. We will use a [Quix Streams `Source`](../../connectors/sources/README.md) to generate text to be processed by our new Word Counter `Application`. -NOTE: Our example uses `JSON` formatting for Kafka message values. + +!!! NOTE + + Our example uses `JSON` formatting for Kafka message values. @@ -97,16 +100,22 @@ app = Application( ) ``` -First, create the [Quix Streams Application](../../configuration.md), which is our constructor for everything! We provide it our connection settings, consumer group (ideally unique per Application), and where the consumer group should start from on our topic. +Create a [Quix Streams Application](../../configuration.md), which is our constructor for everything! + +We provide it our connection settings, consumer group (ideally unique per Application), +and where the consumer group should start from on the (internal) `Source` topic. + + +!!! TIP -NOTE: Once you are more familiar with Kafka, we recommend [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls). + Once you are more familiar with Kafka, we recommend + [learning more about auto_offset_reset](https://www.quix.io/blog/kafka-auto-offset-reset-use-cases-and-pitfalls). -### Define Topics +### Specify Topics ```python -product_reviews_topic = app.topic(name="product_reviews") word_counts_topic = app.topic(name="product_review_word_counts") ``` @@ -114,7 +123,9 @@ Next we define our input/output topics, named `product_reviews` and `product_rev They each return [`Topic`](../../api-reference/topics.md) objects, used later on. -NOTE: The topics will automatically be created for you in Kafka when you run the application should they not exist. +!!! NOTE + + Any missing topics will be automatically created for you upon running an `Application`. @@ -125,11 +136,18 @@ NOTE: The topics will automatically be created for you in Kafka when you run the sdf = app.dataframe(topic=product_reviews_topic) ``` -Now for the fun part: building our [StreamingDataFrame](../../processing.md#introduction-to-streamingdataframe), often shorthanded to "SDF". 
+Now for the fun part: building our [StreamingDataFrame](../../processing.md#introduction-to-streamingdataframe), often shorthanded to "SDF". SDF allows manipulating the message value in a dataframe-like fashion using various operations. -After initializing, we continue re-assigning to the same `sdf` variable as we add operations. +After initializing with either a `Topic` or `Source`, we continue reassigning to the +same `sdf` variable as we add operations. + +!!! NOTE + + A few `StreamingDataFrame` operations are + ["in-place"](../../advanced/dataframe-assignments.md#valid-in-place-operations), + like `.print()`. (Also, notice that we pass our input `Topic` from the previous step to it.) @@ -165,11 +183,13 @@ to this: `>>> [('bob', 1), ('likes', 2), ('bananas', 1), ('and', 1), ('frank', 1), ('apples', 1)]` -NOTE: Two VERY important and related points around the `expand=True` argument: +!!! NOTE -1. It tells SDF "hey, this .apply() returns _**multiple independent**_ events!" + Two VERY important and related points around the `expand=True` argument: -2. Our `F` returns a `list` (or a non-dict iterable of some kind), hence the "expand"! + 1. It tells SDF "hey, this .apply() returns _**multiple independent**_ events!" + + 2. Our `F` returns a `list` (or a non-dict iterable of some kind), hence the "expand"! @@ -214,10 +234,18 @@ via [`SDF.to_topic(T)`](../../processing.md#writing-data-to-kafka-topics), where Notice here the optional `key` argument, which allows you to provide a [custom key generator](../../processing.md#changing-message-key-before-producing). While it's fairly common to maintain the input event's key (SDF's default behavior), -there are many reasons why you might adjust it...like here (NOTE: advanced concept below)! +there are also many reasons why you might adjust it, so we showcase an example of that +here! + +!!! QUESTION -We are changing the message key to the word; this data structure enables -calculating _total word counts over time_ from this topic (with a new application, of course!). + What would be the benefit of changing the key to the counted word, in this case? + + This key change would enable calculating _total word counts over time_ from this + topic without additional data transformations (a more advanced operation). + + Even though we won't do that in this example, you can imagine doing so in a + downstream `Application`! In the end we would produce 5 messages in total, like so: @@ -228,7 +256,10 @@ In the end we would produce 5 messages in total, like so: # etc... ``` -NOTE: This is how you would see the values in the Kafka topic `product_review_word_counts`. +!!! NOTE + + This is a user-friendly representation of how a message key/value in the Kafka topic + `product_review_word_counts` would appear. @@ -257,10 +288,16 @@ One thing to keep in mind is that the Quix Streams does not log/print any messag operations by default. To get visual outputs around message processing, you can either: + - use [recommended way of printing/logging with SDF](../../processing.md#debugging) - + - use `DEBUG` mode via `Application(loglevel="DEBUG")` - - WARNING: you should NOT run your applications in `DEBUG` mode in production. + + + !!! DANGER + + you should NOT run your applications in `DEBUG` mode in production. 
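To pull together the two ideas this tutorial's changes touch on, `expand=True` and re-keying in `to_topic()`, here is a condensed sketch. It is not the tutorial's exact code: the `"Text"` field name and the counting lambda are assumptions, while the topic names match the tutorial.

```python
from collections import Counter

from quixstreams import Application

app = Application(broker_address="localhost:9092", consumer_group="word-counter")
product_reviews_topic = app.topic(name="product_reviews")
word_counts_topic = app.topic(name="product_review_word_counts")

sdf = app.dataframe(topic=product_reviews_topic)

# One review in -> many independent (word, count) events out, thanks to expand=True
sdf = sdf.apply(
    lambda review: list(Counter(review["Text"].lower().split()).items()),
    expand=True,
)

# Re-key each outgoing message by the counted word before producing downstream
sdf = sdf.to_topic(word_counts_topic, key=lambda word_count: word_count[0])

if __name__ == "__main__":
    app.run()
```
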
+ diff --git a/mkdocs.yml b/mkdocs.yml index 9df6b2671..6d541a96d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,6 +8,8 @@ theme: features: - navigation.path - navigation.indexes + - content.code.copy + - content.code.select markdown_extensions: - attr_list @@ -26,9 +28,10 @@ nav: - Quickstart: 'quickstart.md' - Tutorials: - 'tutorials/README.md' - - Word Count: 'tutorials/word-count/tutorial.md' - Anomaly Detection: 'tutorials/anomaly-detection/tutorial.md' - Purchase Filtering: 'tutorials/purchase-filtering/tutorial.md' + - Word Count: 'tutorials/word-count/tutorial.md' + - Websocket Source: 'tutorials/websocket-source/tutorial.md' - How to: - Produce Data to Kafka: producer.md - Process & Transform Data: processing.md