-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement CSVSource A base CSV source that reads data from a single CSV file.
- Loading branch information
1 parent
1240e61
commit c602cb8
Showing
6 changed files
with
178 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .base import * | ||
from .manager import SourceException | ||
from .multiprocessing import multiprocessing | ||
from .csv import CSVSource |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
import csv | ||
import json | ||
|
||
from typing import Optional, Callable, Any | ||
|
||
from quixstreams.models.topics import Topic | ||
|
||
from .base import Source | ||
|
||
|
||
class CSVSource(Source): | ||
def __init__( | ||
self, | ||
path: str, | ||
dialect: str = "excel", | ||
name: Optional[str] = None, | ||
shutdown_timeout: float = 10, | ||
key_deserializer: Callable[[Any], str] = str, | ||
value_deserializer: Callable[[Any], str] = json.loads, | ||
) -> None: | ||
""" | ||
A base CSV source that reads data from a single CSV file. | ||
Best used with :class:`quixstreams.sinks.csv.CSVSink`. | ||
Required columns: key, value | ||
Optional columns: timestamp | ||
:param path: path to the CSV file | ||
:param dialect: a CSV dialect to use. It affects quoting and delimiters. | ||
See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info. | ||
Default - `"excel"`. | ||
:param key_deseralizer: a callable to convert strings to key. | ||
Default - `str` | ||
:param value_deserializer: a callable to convert strings to value. | ||
Default - `json.loads` | ||
""" | ||
super().__init__(name or path, shutdown_timeout) | ||
self.path = path | ||
self.dialect = dialect | ||
|
||
self._key_deserializer = key_deserializer | ||
self._value_deserializer = value_deserializer | ||
|
||
def run(self): | ||
key_deserializer = self._key_deserializer | ||
value_deserializer = self._value_deserializer | ||
|
||
with open(self.path, "r") as f: | ||
reader = csv.DictReader(f, dialect=self.dialect) | ||
|
||
while self.running: | ||
try: | ||
item = next(reader) | ||
except StopIteration: | ||
return | ||
|
||
# if a timestamp column exist with no value timestamp is "" | ||
timestamp = item.get("timestamp") or None | ||
if timestamp is not None: | ||
timestamp = int(timestamp) | ||
|
||
msg = self.serialize( | ||
key=key_deserializer(item["key"]), | ||
value=value_deserializer(item["value"]), | ||
timestamp_ms=timestamp, | ||
) | ||
|
||
self.produce( | ||
key=msg.key, | ||
value=msg.value, | ||
timestamp=msg.timestamp, | ||
headers=msg.headers, | ||
) | ||
|
||
def default_topic(self) -> Topic: | ||
return Topic( | ||
name=self.name, | ||
key_serializer="string", | ||
key_deserializer="string", | ||
value_deserializer="json", | ||
value_serializer="json", | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import csv | ||
import json | ||
import pytest | ||
|
||
from unittest.mock import MagicMock | ||
|
||
from quixstreams.sources import CSVSource | ||
from quixstreams.rowproducer import RowProducer | ||
|
||
|
||
class TestCSVSource: | ||
|
||
@pytest.fixture | ||
def producer(self): | ||
producer = MagicMock(spec=RowProducer) | ||
producer.flush.return_value = 0 | ||
return producer | ||
|
||
def test_read(self, tmp_path, producer): | ||
path = tmp_path / "source.csv" | ||
with open(path, "w") as f: | ||
writer = csv.DictWriter( | ||
f, dialect="excel", fieldnames=("key", "value", "timestamp") | ||
) | ||
writer.writeheader() | ||
writer.writerows( | ||
[ | ||
{"key": "key1", "value": json.dumps({"value": "value1"})}, | ||
{"key": "key2", "value": json.dumps({"value": "value2"})}, | ||
{"key": "key3", "value": json.dumps({"value": "value3"})}, | ||
{"key": "key4", "value": json.dumps({"value": "value4"})}, | ||
{ | ||
"key": "key5", | ||
"value": json.dumps({"value": "value5"}), | ||
"timestamp": 10000, | ||
}, | ||
] | ||
) | ||
|
||
source = CSVSource(path) | ||
source.configure(source.default_topic(), producer) | ||
source.start() | ||
|
||
assert producer.produce.called | ||
assert producer.produce.call_count == 5 | ||
assert producer.produce.call_args.kwargs == { | ||
"buffer_error_max_tries": 3, | ||
"headers": None, | ||
"key": b"key5", | ||
"partition": None, | ||
"poll_timeout": 5.0, | ||
"timestamp": 10000, | ||
"topic": path, | ||
"value": b'{"value":"value5"}', | ||
} | ||
|
||
def test_read_no_timestamp(self, tmp_path, producer): | ||
path = tmp_path / "source.csv" | ||
with open(path, "w") as f: | ||
writer = csv.DictWriter(f, dialect="excel", fieldnames=("key", "value")) | ||
writer.writeheader() | ||
writer.writerows( | ||
[ | ||
{"key": "key1", "value": json.dumps({"value": "value1"})}, | ||
{"key": "key2", "value": json.dumps({"value": "value2"})}, | ||
{"key": "key3", "value": json.dumps({"value": "value3"})}, | ||
{"key": "key4", "value": json.dumps({"value": "value4"})}, | ||
{"key": "key5", "value": json.dumps({"value": "value5"})}, | ||
] | ||
) | ||
|
||
source = CSVSource(path) | ||
source.configure(source.default_topic(), producer) | ||
source.start() | ||
|
||
assert producer.produce.called | ||
assert producer.produce.call_count == 5 | ||
assert producer.produce.call_args.kwargs == { | ||
"buffer_error_max_tries": 3, | ||
"headers": None, | ||
"key": b"key5", | ||
"partition": None, | ||
"poll_timeout": 5.0, | ||
"timestamp": None, | ||
"topic": path, | ||
"value": b'{"value":"value5"}', | ||
} |