Commit

Implement CSVSource (#490)
Implement CSVSource

A base CSV source that reads data from a single CSV file.
quentin-quix authored Sep 9, 2024
1 parent 1240e61 commit c602cb8
Showing 6 changed files with 178 additions and 3 deletions.
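
As context for the change (not part of the commit itself), a minimal usage sketch is shown below. It assumes a local broker address, a placeholder CSV file name, and that this version already exposes the sources framework's `app.dataframe(source=...)` hook and `app.run(sdf)` entry point.

from quixstreams import Application
from quixstreams.sources import CSVSource

# Assumed setup: broker address and CSV path are placeholders.
app = Application(broker_address="localhost:9092")

# Each row of "input.csv" needs "key" and "value" columns; an optional
# "timestamp" column (milliseconds) is used when present.
source = CSVSource("input.csv")

sdf = app.dataframe(source=source)
sdf = sdf.update(lambda value: print(value))  # inspect messages as they arrive

app.run(sdf)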
2 changes: 1 addition & 1 deletion quixstreams/models/topics/topic.py
@@ -313,4 +313,4 @@ def deserialize(self, message: ConfluentKafkaMessageProto):
)

def __repr__(self):
return f'<{self.__class__.__name__} name="{self._name}"> '
return f'<{self.__class__.__name__} name="{self.name}">'
4 changes: 2 additions & 2 deletions quixstreams/sinks/csv.py
@@ -27,9 +27,9 @@ def __init__(
See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info.
Default - `"excel"`.
:param key_serializer: a callable to convert keys to strings.
Default - `str()`.
Default - `str`.
:param value_serializer: a callable to convert values to strings.
Default - `json.dumps()`.
Default - `json.dumps`.
"""
super().__init__()
self.path = path
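
For reference, a brief hypothetical sketch of what the corrected docstring describes: the serializers are passed as callables (the defaults being `str` and `json.dumps`), not as call results. It assumes the class defined in quixstreams/sinks/csv.py is exported as CSVSink; the output path and the custom value serializer are illustrative.

import json

from quixstreams.sinks.csv import CSVSink

# Illustrative only: pass callables such as `str` and `json.dumps`,
# not `str()` or `json.dumps()`.
sink = CSVSink(
    path="output.csv",
    key_serializer=str,
    value_serializer=lambda v: json.dumps(v, sort_keys=True),
)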
1 change: 1 addition & 0 deletions quixstreams/sources/__init__.py
@@ -1,3 +1,4 @@
from .base import *
from .manager import SourceException
from .multiprocessing import multiprocessing
from .csv import CSVSource
82 changes: 82 additions & 0 deletions quixstreams/sources/csv.py
@@ -0,0 +1,82 @@
import csv
import json

from typing import Optional, Callable, Any

from quixstreams.models.topics import Topic

from .base import Source


class CSVSource(Source):
def __init__(
self,
path: str,
dialect: str = "excel",
name: Optional[str] = None,
shutdown_timeout: float = 10,
key_deserializer: Callable[[Any], str] = str,
value_deserializer: Callable[[Any], str] = json.loads,
) -> None:
"""
A base CSV source that reads data from a single CSV file.
Best used with :class:`quixstreams.sinks.csv.CSVSink`.
Required columns: key, value
Optional columns: timestamp
:param path: path to the CSV file
:param dialect: a CSV dialect to use. It affects quoting and delimiters.
See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info.
Default - `"excel"`.
:param key_deserializer: a callable to convert strings to keys.
Default - `str`.
:param value_deserializer: a callable to convert strings to values.
Default - `json.loads`.
"""
super().__init__(name or path, shutdown_timeout)
self.path = path
self.dialect = dialect

self._key_deserializer = key_deserializer
self._value_deserializer = value_deserializer

def run(self):
key_deserializer = self._key_deserializer
value_deserializer = self._value_deserializer

with open(self.path, "r") as f:
reader = csv.DictReader(f, dialect=self.dialect)

while self.running:
try:
item = next(reader)
except StopIteration:
return

# if a timestamp column exists but the cell is empty, its value is ""
timestamp = item.get("timestamp") or None
if timestamp is not None:
timestamp = int(timestamp)

msg = self.serialize(
key=key_deserializer(item["key"]),
value=value_deserializer(item["value"]),
timestamp_ms=timestamp,
)

self.produce(
key=msg.key,
value=msg.value,
timestamp=msg.timestamp,
headers=msg.headers,
)

def default_topic(self) -> Topic:
return Topic(
name=self.name,
key_serializer="string",
key_deserializer="string",
value_deserializer="json",
value_serializer="json",
)
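
To make the required and optional columns concrete, here is a hypothetical input file written with the standard csv module; the file name, column values, and sensor keys are placeholders. With the defaults above, the "key" cell stays a string, the "value" cell is parsed with `json.loads`, and an empty "timestamp" cell means the message is produced without an explicit timestamp.

import csv
import json

from quixstreams.sources import CSVSource

# Hypothetical input: "key" and "value" are required columns, "timestamp"
# (milliseconds) is optional and may be left empty.
with open("readings.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=("key", "value", "timestamp"))
    writer.writeheader()
    writer.writerow(
        {"key": "sensor-1", "value": json.dumps({"temperature": 21.5}), "timestamp": 1725868800000}
    )
    writer.writerow({"key": "sensor-2", "value": json.dumps({"temperature": 19.0})})

# The default deserializers (str and json.loads) turn each row into a
# string key and a dict value before it is serialized for the topic.
source = CSVSource("readings.csv")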
5 changes: 5 additions & 0 deletions tests/test_quixstreams/test_models/test_topics/test_topics.py
@@ -44,6 +44,11 @@ def __call__(self, value: bytes, ctx: SerializationContext):


class TestTopic:

def test_repr(self, topic_manager_topic_factory):
topic = topic_manager_topic_factory(name="foo")
assert str(topic) == '<Topic name="foo">'

@pytest.mark.parametrize(
"key_deserializer, value_deserializer, key, value, expected_key, expected_value",
[
87 changes: 87 additions & 0 deletions tests/test_quixstreams/test_sources/test_csv.py
@@ -0,0 +1,87 @@
import csv
import json
import pytest

from unittest.mock import MagicMock

from quixstreams.sources import CSVSource
from quixstreams.rowproducer import RowProducer


class TestCSVSource:

@pytest.fixture
def producer(self):
producer = MagicMock(spec=RowProducer)
producer.flush.return_value = 0
return producer

def test_read(self, tmp_path, producer):
path = tmp_path / "source.csv"
with open(path, "w") as f:
writer = csv.DictWriter(
f, dialect="excel", fieldnames=("key", "value", "timestamp")
)
writer.writeheader()
writer.writerows(
[
{"key": "key1", "value": json.dumps({"value": "value1"})},
{"key": "key2", "value": json.dumps({"value": "value2"})},
{"key": "key3", "value": json.dumps({"value": "value3"})},
{"key": "key4", "value": json.dumps({"value": "value4"})},
{
"key": "key5",
"value": json.dumps({"value": "value5"}),
"timestamp": 10000,
},
]
)

source = CSVSource(path)
source.configure(source.default_topic(), producer)
source.start()

assert producer.produce.called
assert producer.produce.call_count == 5
assert producer.produce.call_args.kwargs == {
"buffer_error_max_tries": 3,
"headers": None,
"key": b"key5",
"partition": None,
"poll_timeout": 5.0,
"timestamp": 10000,
"topic": path,
"value": b'{"value":"value5"}',
}

def test_read_no_timestamp(self, tmp_path, producer):
path = tmp_path / "source.csv"
with open(path, "w") as f:
writer = csv.DictWriter(f, dialect="excel", fieldnames=("key", "value"))
writer.writeheader()
writer.writerows(
[
{"key": "key1", "value": json.dumps({"value": "value1"})},
{"key": "key2", "value": json.dumps({"value": "value2"})},
{"key": "key3", "value": json.dumps({"value": "value3"})},
{"key": "key4", "value": json.dumps({"value": "value4"})},
{"key": "key5", "value": json.dumps({"value": "value5"})},
]
)

source = CSVSource(path)
source.configure(source.default_topic(), producer)
source.start()

assert producer.produce.called
assert producer.produce.call_count == 5
assert producer.produce.call_args.kwargs == {
"buffer_error_max_tries": 3,
"headers": None,
"key": b"key5",
"partition": None,
"poll_timeout": 5.0,
"timestamp": None,
"topic": path,
"value": b'{"value":"value5"}',
}
