From 07cca1759f70391579f597111e482999d81f7c62 Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sat, 31 Aug 2024 23:37:59 +0200 Subject: [PATCH 01/87] feat(deltalake): Implement Delta Lake backend and add dependencies - Add DeltaLakeCallback class with support for various data types - Implement partitioning, Z-ordering, and time travel features - Add schema documentation for each data type - Include Delta Lake dependencies in setup.py - Create demo file for Delta Lake usage with S3 configuration - Update extras_require in setup.py to include deltalake option --- cryptofeed/backends/deltalake.py | 328 +++++++++++++++++++++++++++++++ examples/demo_deltalake.py | 54 +++++ setup.py | 2 + 3 files changed, 384 insertions(+) create mode 100644 cryptofeed/backends/deltalake.py create mode 100644 examples/demo_deltalake.py diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py new file mode 100644 index 000000000..1fd08e555 --- /dev/null +++ b/cryptofeed/backends/deltalake.py @@ -0,0 +1,328 @@ +''' +Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. +''' +from typing import Optional, List, Dict, Any +import logging +import pandas as pd +from deltalake import DeltaTable, write_deltalake + +from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK + +LOG = logging.getLogger('feedhandler') + +class DeltaLakeCallback(BackendQueue): + def __init__(self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs: Any): + super().__init__() + self.key = key or self.default_key + self.base_path = base_path + self.delta_table_path = f"{self.base_path}/{self.key}" + self.custom_columns = custom_columns or {} + self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.optimize_interval = optimize_interval + self.z_order_cols = z_order_cols or self._default_z_order_cols() + self.time_travel = time_travel + self.storage_options = storage_options or {} + self.write_count = 0 + self.running = True + + if optimize_interval <= 0: + raise ValueError("optimize_interval must be a positive integer") + + if not isinstance(self.partition_cols, list): + raise TypeError("partition_cols must be a list of strings") + + if not isinstance(self.z_order_cols, list): + raise TypeError("z_order_cols must be a list of strings") + + def _default_z_order_cols(self) -> List[str]: + common_cols = ['exchange', 'symbol', 'timestamp'] + data_specific_cols = { + TRADES: ['price', 'amount'], + FUNDING: ['rate'], + TICKER: ['bid', 'ask'], + OPEN_INTEREST: ['open_interest'], + LIQUIDATIONS: ['quantity', 'price'], + BOOK: [], # Book data is typically queried by timestamp and symbol + CANDLES: ['open', 'close', 'high', 'low'], + ORDER_INFO: ['status', 'price', 'amount'], + TRANSACTIONS: ['type', 'amount'], + BALANCES: ['balance'], + FILLS: ['price', 'amount'] + } + return common_cols + data_specific_cols.get(self.key, []) + + async def writer(self): + while self.running: + async with self.read_queue() as updates: + if updates: + df = pd.DataFrame(updates) + df['date'] = 
pd.to_datetime(df['timestamp'], unit='s') + df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') + df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day + + # Reorder columns to put exchange and symbol first + cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + df = df[cols] + + if self.custom_columns: + df = df.rename(columns=self.custom_columns) + + await self._write_batch(df) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + return + + try: + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + except Exception as e: + LOG.error(f"Error writing to Delta Lake: {e}") + + async def _optimize_table(self): + LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + dt.optimize.compact() + if self.z_order_cols: + dt.optimize.z_order(self.z_order_cols) + + def _update_metadata(self): + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") + + async def stop(self): + self.running = False + + def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: + if self.time_travel: + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + if timestamp: + return dt.version_at_timestamp(timestamp) + else: + return dt.version() + else: + LOG.warning("Time travel is not enabled for this table") + return None + +class TradeDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRADES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string (nullable) + - side: string + - amount: float64 + - price: float64 + - type: string (nullable) + """ + +class FundingDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FUNDING + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - mark_price: float64 (nullable) + - rate: float64 + - next_funding_time: datetime64[ns] (nullable) + - predicted_rate: float64 (nullable) + """ + +class TickerDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TICKER + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - bid: float64 + - ask: float64 + """ + +class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = OPEN_INTEREST + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - open_interest: float64 + """ + +class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = LIQUIDATIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - 
receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - side: string + - quantity: float64 + - price: float64 + - id: string + - status: string + """ + +class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): + default_key = BOOK + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - delta: dict (nullable, contains 'bid' and 'ask' updates) + - book: dict (contains full order book snapshot when available) + """ + +class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = CANDLES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - start: datetime64[ns] + - stop: datetime64[ns] + - interval: string + - trades: int64 (nullable) + - open: float64 + - close: float64 + - high: float64 + - low: float64 + - volume: float64 + - closed: bool (nullable) + """ + +class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = ORDER_INFO + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string + - client_order_id: string (nullable) + - side: string + - status: string + - type: string + - price: float64 + - amount: float64 + - remaining: float64 (nullable) + - account: string (nullable) + """ + +class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRANSACTIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - type: string + - status: string + - amount: float64 + """ + +class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = BALANCES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - balance: float64 + - reserved: float64 (nullable) + """ + +class FillsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FILLS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - price: float64 + - amount: float64 + - side: string + - fee: float64 (nullable) + - id: string + - order_id: string + - liquidity: string + - type: string + - account: string (nullable) + """ \ No newline at end of file diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py new file mode 100644 index 000000000..5c973016e --- /dev/null +++ b/examples/demo_deltalake.py @@ -0,0 +1,54 @@ +''' +Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. 
+''' +from cryptofeed import FeedHandler +from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake +from cryptofeed.defines import FUNDING, TICKER, TRADES +from cryptofeed.exchanges import Binance + + +def main(): + f = FeedHandler() + + # Define the Delta Lake base path (can be local or S3) + delta_base_path = 's3://your-bucket/path/to/delta/tables' + + # S3 storage options (remove if using local storage) + s3_options = { + "AWS_ACCESS_KEY_ID": "your_access_key", + "AWS_SECRET_ACCESS_KEY": "your_secret_key", + "AWS_REGION": "your_region" + } + + # Add Binance feed with Delta Lake callbacks + f.add_feed(Binance( + channels=[TRADES, FUNDING, TICKER], + symbols=['BTC-USDT', 'ETH-USDT'], + callbacks={ + TRADES: TradeDeltaLake( + base_path=delta_base_path, + optimize_interval=50, # More frequent table optimization + time_travel=True, # Enable time travel feature + storage_options=s3_options # Add S3 configuration + ), + FUNDING: FundingDeltaLake( + base_path=delta_base_path, + storage_options=s3_options # Add S3 configuration + ), + TICKER: TickerDeltaLake( + base_path=delta_base_path, + partition_cols=['exchange', 'symbol', 'year', 'month', 'day'], # Custom partitioning + z_order_cols=['timestamp', 'bid', 'ask'], # Enable Z-ordering + storage_options=s3_options # Add S3 configuration + ) + } + )) + + f.run() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/setup.py b/setup.py index adf10f870..344573348 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def run_tests(self): "rabbit": ["aio_pika", "pika"], "redis": ["hiredis", "redis>=4.5.1"], "zmq": ["pyzmq"], + "deltalake": ["deltalake>=0.6.1", "pandas"], "all": [ "arctic", "google_cloud_pubsub>=2.4.1", @@ -107,6 +108,7 @@ def run_tests(self): "hiredis", "redis>=4.5.1", "pyzmq", + "deltalake>=0.6.1", ], }, ) From c003c69b76a44fc14ea63b3036279af096bcbb60 Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sun, 1 Sep 2024 00:11:55 +0200 Subject: [PATCH 02/87] feat(deltalake): optimize Delta Lake implementation --- cryptofeed/backends/deltalake.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 1fd08e555..bd29037ae 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,7 +5,9 @@ associated with this software. 
''' from typing import Optional, List, Dict, Any +from collections import defaultdict import logging + import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -228,6 +230,11 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): + self.snapshots_only = snapshots_only + self.snapshot_interval = snapshot_interval + self.snapshot_count = defaultdict(int) + super().__init__(*args, **kwargs) class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES From f8256dc2b64544119cc207fb043512a6085c4ca2 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 00:51:23 +0200 Subject: [PATCH 03/87] fix(deltalake): fix book table name --- cryptofeed/backends/deltalake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index bd29037ae..c55475f2c 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -12,7 +12,7 @@ from deltalake import DeltaTable, write_deltalake from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS LOG = logging.getLogger('feedhandler') @@ -217,7 +217,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): """ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - default_key = BOOK + default_key = "book" """ Schema: - timestamp: datetime64[ns] (from 'date' column) From cf2c92628725a227d19f993d57ca0c6afcd6bdb9 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:01:48 +0200 Subject: [PATCH 04/87] fix(deltalake): Fix book name --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c55475f2c..c1b882c34 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -57,7 +57,7 @@ def _default_z_order_cols(self) -> List[str]: TICKER: ['bid', 'ask'], OPEN_INTEREST: ['open_interest'], LIQUIDATIONS: ['quantity', 'price'], - BOOK: [], # Book data is typically queried by timestamp and symbol + "book": [], # Book data is typically queried by timestamp and symbol CANDLES: ['open', 'close', 'high', 'low'], ORDER_INFO: ['status', 'price', 'amount'], TRANSACTIONS: ['type', 'amount'], From b02ab52443cd04b4790f1f1716a4575ac0cb8138 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:21:33 +0200 Subject: [PATCH 05/87] fix(deltalake): Fix numeric type --- cryptofeed/backends/deltalake.py | 35 ++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c1b882c34..fda44563e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -4,9 +4,10 @@ Please see the LICENSE file for the terms and conditions associated with this software. 
''' -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Union from collections import defaultdict import logging +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -16,16 +17,19 @@ LOG = logging.getLogger('feedhandler') + class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, + def __init__(self, + base_path: str, + key: Optional[str] = None, custom_columns: Optional[Dict[str, str]] = None, partition_cols: Optional[List[str]] = None, optimize_interval: int = 100, z_order_cols: Optional[List[str]] = None, time_travel: bool = False, storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, **kwargs: Any): super().__init__() self.key = key or self.default_key @@ -49,6 +53,9 @@ def __init__(self, if not isinstance(self.z_order_cols, list): raise TypeError("z_order_cols must be a list of strings") + self.numeric_type = numeric_type + self.none_to = none_to + def _default_z_order_cols(self) -> List[str]: common_cols = ['exchange', 'symbol', 'timestamp'] data_specific_cols = { @@ -89,6 +96,15 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Replace None values with the specified value + if self.none_to is not None: + df = df.fillna(self.none_to) + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( self.delta_table_path, @@ -134,6 +150,7 @@ def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: LOG.warning("Time travel is not enabled for this table") return None + class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ @@ -152,6 +169,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): - type: string (nullable) """ + class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ @@ -169,6 +187,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): - predicted_rate: float64 (nullable) """ + class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ @@ -184,6 +203,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): - ask: float64 """ + class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ @@ -198,6 +218,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): - open_interest: float64 """ + class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ @@ -216,6 +237,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): - status: string """ + class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ @@ -236,6 +258,7 @@ def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs self.snapshot_count = defaultdict(int) super().__init__(*args, **kwargs) + class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ @@ -259,6 +282,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): - closed: bool (nullable) """ + class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ @@ -281,6 +305,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): - account: string (nullable) """ + class 
TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ @@ -297,6 +322,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): - amount: float64 """ + class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ @@ -312,6 +338,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): - reserved: float64 (nullable) """ + class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ From 42eea5544115e0b1bc3132f67cf9d31ccc3a09ea Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:26:24 +0200 Subject: [PATCH 06/87] fix: Ensure timestamp columns have nanosecond precision in DeltaLake backend --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 109 +++++++++++++++++++------------ 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 5860625f7..ac64f2b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ ENV/ # PyCharm .idea/ +.aider* diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fda44563e..963488b29 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -1,49 +1,60 @@ -''' +""" Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' -from typing import Optional, List, Dict, Any, Union -from collections import defaultdict +""" + import logging -import numpy as np +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + -LOG = logging.getLogger('feedhandler') +LOG = logging.getLogger("feedhandler") class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, - custom_columns: Optional[Dict[str, str]] = None, - partition_cols: Optional[List[str]] = None, - optimize_interval: int = 100, - z_order_cols: Optional[List[str]] = None, - time_travel: bool = False, - storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = float, - none_to: Any = None, - **kwargs: Any): + def __init__( + self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, + **kwargs: Any, + ): super().__init__() self.key = key or self.default_key self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.partition_cols = partition_cols or [ + "exchange", + "symbol", + "year", + "month", + "day", + ] self.optimize_interval = optimize_interval 
self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel self.storage_options = storage_options or {} self.write_count = 0 self.running = True - + if optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") @@ -57,19 +68,19 @@ def __init__(self, self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ['exchange', 'symbol', 'timestamp'] + common_cols = ["exchange", "symbol", "timestamp"] data_specific_cols = { - TRADES: ['price', 'amount'], - FUNDING: ['rate'], - TICKER: ['bid', 'ask'], - OPEN_INTEREST: ['open_interest'], - LIQUIDATIONS: ['quantity', 'price'], + TRADES: ["price", "amount"], + FUNDING: ["rate"], + TICKER: ["bid", "ask"], + OPEN_INTEREST: ["open_interest"], + LIQUIDATIONS: ["quantity", "price"], "book": [], # Book data is typically queried by timestamp and symbol - CANDLES: ['open', 'close', 'high', 'low'], - ORDER_INFO: ['status', 'price', 'amount'], - TRANSACTIONS: ['type', 'amount'], - BALANCES: ['balance'], - FILLS: ['price', 'amount'] + CANDLES: ["open", "close", "high", "low"], + ORDER_INFO: ["status", "price", "amount"], + TRANSACTIONS: ["type", "amount"], + BALANCES: ["balance"], + FILLS: ["price", "amount"], } return common_cols + data_specific_cols.get(self.key, []) @@ -78,17 +89,25 @@ async def writer(self): async with self.read_queue() as updates: if updates: df = pd.DataFrame(updates) - df['date'] = pd.to_datetime(df['timestamp'], unit='s') - df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') - df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day - + df["date"] = pd.to_datetime(df["timestamp"], unit="s") + df["receipt_timestamp"] = pd.to_datetime( + df["receipt_timestamp"], unit="s" + ) + df["year"], df["month"], df["day"] = ( + df["date"].dt.year, + df["date"].dt.month, + df["date"].dt.day, + ) + # Reorder columns to put exchange and symbol first - cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] df = df[cols] - + if self.custom_columns: df = df.rename(columns=self.custom_columns) - + await self._write_batch(df) async def _write_batch(self, df: pd.DataFrame): @@ -96,6 +115,11 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Ensure timestamp columns are in nanosecond precision + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns for col in numeric_columns: @@ -112,7 +136,7 @@ async def _write_batch(self, df: pd.DataFrame): mode="append", partition_by=self.partition_cols, schema_mode="merge", - storage_options=self.storage_options + storage_options=self.storage_options, ) self.write_count += 1 @@ -252,6 +276,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): self.snapshots_only = snapshots_only self.snapshot_interval = snapshot_interval @@ -359,4 +384,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: string - type: string - account: string (nullable) - """ \ No newline 
at end of file + """ From 090c4f4b9f8b114ac2beaa00d272dbab099ac73e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:47:21 +0200 Subject: [PATCH 07/87] feat: Refactor timestamp column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 963488b29..fa6050edb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -115,10 +129,10 @@ async def _write_batch(self, df: pd.DataFrame): return try: - # Ensure timestamp columns are in nanosecond precision + # Convert timestamp columns from ns to us timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[us]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From 2a60f20188e6850348b9e64f6e32806614607446 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:25:31 +0200 Subject: [PATCH 08/87] fix: Handle null values in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fa6050edb..dca2dd07d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -82,7 +82,7 @@ def __init__( self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ["exchange", "symbol", "timestamp"] + common_cols = ["timestamp"] data_specific_cols = { TRADES: ["price", "amount"], FUNDING: ["rate"], @@ -96,7 +96,9 @@ def _default_z_order_cols(self) -> List[str]: BALANCES: ["balance"], FILLS: ["price", "amount"], } - return common_cols + data_specific_cols.get(self.key, []) + z_order_cols = common_cols + data_specific_cols.get(self.key, []) + # Remove any columns that are already in partition_cols + return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): while self.running: @@ -139,9 +141,20 @@ async def _write_batch(self, df: pd.DataFrame): for col in numeric_columns: df[col] = df[col].astype(self.numeric_type) - # Replace None values with the specified value + # Handle null values if self.none_to is not None: df = df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == 'object': + df[col] = df[col].fillna('') # Replace None with empty string for object columns + elif df[col].dtype in ['float64', 'int64']: + df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns + elif df[col].dtype == 
'bool': + df[col] = df[col].fillna(False) # Replace None with False for boolean columns + elif df[col].dtype == 'datetime64[us]': + df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( From 2f018152ecf34f7810be18f5a6cb58d5beaeef5a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:51:21 +0200 Subject: [PATCH 09/87] feat: Implement DeltaLake backend for Cryptofeed --- cryptofeed/backends/deltalake.py | 357 +++++++++++++++++-------------- 1 file changed, 200 insertions(+), 157 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index dca2dd07d..d75277e1b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,6 +5,7 @@ associated with this software. """ +import asyncio import logging from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -13,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -55,13 +42,7 @@ def __init__( self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or [ - "exchange", - "symbol", - "year", - "month", - "day", - ] + self.partition_cols = partition_cols or ["exchange", "symbol", "dt"] self.optimize_interval = optimize_interval self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel @@ -69,17 +50,31 @@ def __init__( self.write_count = 0 self.running = True - if optimize_interval <= 0: + # Validate configuration parameters + self._validate_configuration() + + self.numeric_type = numeric_type + self.none_to = none_to + + def _validate_configuration(self): + if self.optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") - if not isinstance(self.partition_cols, list): + if not isinstance(self.partition_cols, list) or not all( + isinstance(col, str) for col in self.partition_cols + ): raise TypeError("partition_cols must be a list of strings") - if not isinstance(self.z_order_cols, list): + if not isinstance(self.z_order_cols, list) or not all( + isinstance(col, str) for col in self.z_order_cols + ): raise TypeError("z_order_cols must be a list of strings") - self.numeric_type = numeric_type - self.none_to = none_to + if not isinstance(self.storage_options, dict): + raise TypeError("storage_options must be a dictionary") + + if not isinstance(self.numeric_type, (type, str)): + raise TypeError("numeric_type must be a type or a string") def _default_z_order_cols(self) -> List[str]: common_cols = ["timestamp"] @@ -104,17 +99,9 @@ async def writer(self): while self.running: async with self.read_queue() as updates: if updates: + LOG.info(f"Received {len(updates)} updates for processing.") df = pd.DataFrame(updates) - df["date"] = 
pd.to_datetime(df["timestamp"], unit="s") - df["receipt_timestamp"] = pd.to_datetime( - df["receipt_timestamp"], unit="s" - ) - df["year"], df["month"], df["day"] = ( - df["date"].dt.year, - df["date"].dt.month, - df["date"].dt.day, - ) - + self._convert_fields(df) # Reorder columns to put exchange and symbol first cols = ["exchange", "symbol"] + [ col for col in df.columns if col not in ["exchange", "symbol"] @@ -126,55 +113,126 @@ async def writer(self): await self._write_batch(df) + def _convert_fields(self, df: pd.DataFrame): + LOG.debug("Converting fields in DataFrame.") + self._convert_datetime_fields(df) + self._convert_category_fields(df) + self._convert_int_fields(df) + + def _convert_datetime_fields(self, df: pd.DataFrame): + LOG.debug("Converting datetime fields.") + datetime_columns = ["timestamp", "receipt_timestamp"] + for col in datetime_columns: + if col in df.columns: + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + if "timestamp" in df.columns: + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + + def _convert_category_fields(self, df: pd.DataFrame): + LOG.debug("Converting category fields.") + category_columns = [ + "exchange", + "symbol", + "side", + "type", + "status", + "currency", + "liquidity", + ] + for col in category_columns: + if col in df.columns: + df[col] = df[col].astype("category") + + def _convert_int_fields(self, df: pd.DataFrame): + LOG.debug("Converting integer fields.") + int_columns = ["id", "trade_id", "trades"] + for col in int_columns: + if col in df.columns: + df[col] = df[col].astype("int64") + async def _write_batch(self, df: pd.DataFrame): if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") return - try: - # Convert timestamp columns from ns to us - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[us]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) + max_retries = 3 + retry_delay = 5 # seconds + + for attempt in range(max_retries): + try: + LOG.info( + f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." 
+ ) + # Convert timestamp columns to datetime64[ns] + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Handle null values + df = self._handle_null_values(df) + + LOG.info( + f"Writing batch of {len(df)} records to {self.delta_table_path}" + ) + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options, + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + LOG.info("Batch write successful.") + break # Exit the retry loop if write is successful + + except Exception as e: + LOG.error( + f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" + ) + if attempt < max_retries - 1: + LOG.info(f"Retrying in {retry_delay} seconds...") + await asyncio.sleep(retry_delay) + else: + LOG.error( + "Max retries reached. Failed to write batch to Delta Lake." + ) - # Handle null values - if self.none_to is not None: - df = df.fillna(self.none_to) - else: - # Replace None with appropriate default values based on column type - for col in df.columns: - if df[col].dtype == 'object': - df[col] = df[col].fillna('') # Replace None with empty string for object columns - elif df[col].dtype in ['float64', 'int64']: - df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns - elif df[col].dtype == 'bool': - df[col] = df[col].fillna(False) # Replace None with False for boolean columns - elif df[col].dtype == 'datetime64[us]': - df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns - - LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") - write_deltalake( - self.delta_table_path, - df, - mode="append", - partition_by=self.partition_cols, - schema_mode="merge", - storage_options=self.storage_options, - ) - self.write_count += 1 - - if self.write_count % self.optimize_interval == 0: - await self._optimize_table() - - if self.time_travel: - self._update_metadata() - - except Exception as e: - LOG.error(f"Error writing to Delta Lake: {e}") + def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: + if self.none_to is not None: + return df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].fillna( + "" + ) # Replace None with empty string for object columns + elif df[col].dtype in ["float64", "int64"]: + df[col] = df[col].fillna( + 0 + ) # Replace None with 0 for numeric columns + elif df[col].dtype == "bool": + df[col] = df[col].fillna( + False + ) # Replace None with False for boolean columns + elif df[col].dtype == "datetime64[ns]": + df[col] = df[col].fillna( + pd.Timestamp.min + ) # Replace None with minimum timestamp for datetime columns + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") @@ -182,21 +240,27 @@ async def _optimize_table(self): dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) + LOG.info("OPTIMIZE operation completed.") def _update_metadata(self): dt = 
DeltaTable(self.delta_table_path, storage_options=self.storage_options) LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") async def stop(self): + LOG.info("Stopping DeltaLakeCallback writer.") self.running = False def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) if timestamp: - return dt.version_at_timestamp(timestamp) + version = dt.version_at_timestamp(timestamp) + LOG.info(f"Retrieved version {version} for timestamp {timestamp}.") + return version else: - return dt.version() + version = dt.version() + LOG.info(f"Retrieved current version {version}.") + return version else: LOG.warning("Time travel is not enabled for this table") return None @@ -208,16 +272,15 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string (nullable) - - side: string + - dt: string + - exchange: category + - symbol: category + - id: int64 (nullable) + - side: category - amount: float64 - price: float64 - - type: string (nullable) + - type: category (nullable) + - trade_id: int64 """ @@ -227,11 +290,9 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - mark_price: float64 (nullable) - rate: float64 - next_funding_time: datetime64[ns] (nullable) @@ -245,11 +306,9 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - bid: float64 - ask: float64 """ @@ -261,11 +320,9 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - open_interest: float64 """ @@ -276,16 +333,14 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - side: string + - dt: string + - exchange: category + - symbol: category + - side: category - quantity: float64 - price: float64 - - id: string - - status: string + - id: int64 + - status: category """ @@ -295,11 +350,9 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ @@ -317,11 +370,9 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - 
year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - start: datetime64[ns] - stop: datetime64[ns] - interval: string @@ -341,16 +392,14 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string + - dt: string + - exchange: category + - symbol: category + - id: int64 - client_order_id: string (nullable) - - side: string - - status: string - - type: string + - side: category + - status: category + - type: category - price: float64 - amount: float64 - remaining: float64 (nullable) @@ -364,13 +413,11 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string - - type: string - - status: string + - dt: string + - exchange: category + - currency: category + - type: category + - status: category - amount: float64 """ @@ -381,11 +428,9 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string + - dt: string + - exchange: category + - currency: category - balance: float64 - reserved: float64 (nullable) """ @@ -397,18 +442,16 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - price: float64 - amount: float64 - - side: string + - side: category - fee: float64 (nullable) - - id: string - - order_id: string - - liquidity: string - - type: string + - id: int64 + - order_id: int64 + - liquidity: category + - type: category - account: string (nullable) """ From 2081b11d6162bd596176822e3216e85f26d5ace2 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:57:16 +0200 Subject: [PATCH 10/87] fix: Refactor DeltaLakeCallback class --- cryptofeed/backends/deltalake.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d75277e1b..6e158f364 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -49,12 +63,10 @@ def __init__( self.storage_options = storage_options or {} self.write_count = 0 self.running = True - - # Validate configuration parameters - self._validate_configuration() - 
self.numeric_type = numeric_type self.none_to = none_to + # Validate configuration parameters + self._validate_configuration() def _validate_configuration(self): if self.optimize_interval <= 0: From a9ba6b224cb9333535e9693c2c3cd63fb3447a91 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:13:37 +0200 Subject: [PATCH 11/87] fix: Add debug logging for DataFrame schema in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 6e158f364..7991f84cb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -175,6 +175,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema:\n{df.dtypes}") + # Convert timestamp columns to datetime64[ns] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: @@ -191,6 +194,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") + write_deltalake( self.delta_table_path, df, From 4ab9fd87939a8e8a0c0a4d30d914018977abf539 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:20:07 +0200 Subject: [PATCH 12/87] fix: Add DataFrame schema logging when timestamp-related error occurs during Delta Lake write --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 7991f84cb..ffb47db7d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -217,9 +217,13 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: + # When error is related to timestamp, print the schema of the DataFrame + if "timestamp" in str(e): + LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) + if attempt < max_retries - 1: LOG.info(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) From 797a789d5a9b5a905e48e8d632546cc292a402b0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:36:58 +0200 Subject: [PATCH 13/87] fix: convert timestamp columns to datetime64[ms] --- cryptofeed/backends/deltalake.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ffb47db7d..57084ec1a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -136,7 +136,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") @@ -178,10 +178,10 @@ async def _write_batch(self, df: pd.DataFrame): # Debug output the schema of the DataFrame LOG.debug(f"DataFrame 
schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ns] + # Convert timestamp columns to datetime64[ms] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[ms]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From ceb8f76f54117c5e73c5fb688a90fef2b1a83dac Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:59:19 +0200 Subject: [PATCH 14/87] fix: Ensure all partition columns are present in the DataFrame --- cryptofeed/backends/deltalake.py | 36 +++++++++++++++----------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 57084ec1a..32e9a902a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -167,6 +153,18 @@ async def _write_batch(self, df: pd.DataFrame): LOG.warning("DataFrame is empty. Skipping write operation.") return + # Ensure all partition columns are present in the DataFrame + for col in self.partition_cols: + if col not in df.columns: + if col == "exchange" or col == "symbol": + df[col] = "" # Default to empty string for categorical columns + elif col == "dt": + df[col] = pd.Timestamp.min.strftime( + "%Y-%m-%d" + ) # Default to min date for date columns + else: + df[col] = 0 # Default to 0 for numeric columns + max_retries = 3 retry_delay = 5 # seconds @@ -218,8 +216,8 @@ async def _write_batch(self, df: pd.DataFrame): except Exception as e: # When error is related to timestamp, print the schema of the DataFrame - if "timestamp" in str(e): - LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) From 84c436d73b0685aeb7aeb04b18195ad689ac7844 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:00:25 +0200 Subject: [PATCH 15/87] fix: convert timestamp column to datetime64[ms] format --- cryptofeed/backends/deltalake.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 32e9a902a..cb9f98328 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + 
BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -124,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = df["timestamp"].dt.date.astype("string") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") @@ -248,9 +262,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: df[col] = df[col].fillna( False ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ns]": + elif df[col].dtype == "datetime64[ms]": df[col] = df[col].fillna( - pd.Timestamp.min + pd.Timestamp.min.astype("datetime64[ms]") ) # Replace None with minimum timestamp for datetime columns return df From 84fd533c13915a3dc83d6070fa1fb73fd63e76dd Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:11:45 +0200 Subject: [PATCH 16/87] feat: Convert timestamp column to date string format in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index cb9f98328..d05ebb06d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -138,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.date.astype("string") + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") From 5e4c4dfcb4ef6fc83324225c4818d159e89f0027 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:32:15 +0200 Subject: [PATCH 17/87] refactor: Simplify null value handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d05ebb06d..e38a755c6 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -248,25 +248,18 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: if self.none_to is not None: return df.fillna(self.none_to) else: - # Replace None with appropriate default values based on column type for col in df.columns: - if df[col].dtype == "object": - df[col] = df[col].fillna( - "" - ) # Replace None with empty string for object columns - elif df[col].dtype in ["float64", "int64"]: - df[col] = df[col].fillna( - 0 - ) # Replace None with 0 for numeric columns - elif df[col].dtype == "bool": - df[col] = df[col].fillna( - False - ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ms]": - df[col] = df[col].fillna( - pd.Timestamp.min.astype("datetime64[ms]") - ) # Replace None with minimum timestamp for datetime columns - return df + if pd.api.types.is_string_dtype( + df[col] + ) or pd.api.types.is_categorical_dtype(df[col]): + df[col] = df[col].fillna("") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna(0) + elif 
pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna(False) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") From 5bdd670bdb972343316d85bc776dbe1b3cdb09a9 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:42:41 +0200 Subject: [PATCH 18/87] fix: Ensure empty string is a category in categorical columns and handle null values correctly --- cryptofeed/backends/deltalake.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e38a755c6..c15c85c10 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -153,7 +153,11 @@ def _convert_category_fields(self, df: pd.DataFrame): ] for col in category_columns: if col in df.columns: - df[col] = df[col].astype("category") + # Add empty string as a category if it's not already present + categories = df[col].unique().tolist() + if '' not in categories: + categories.append('') + df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): LOG.debug("Converting integer fields.") @@ -249,9 +253,12 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_string_dtype( - df[col] - ) or pd.api.types.is_categorical_dtype(df[col]): + if pd.api.types.is_categorical_dtype(df[col]): + # Ensure '' is in the categories before filling + if '' not in df[col].cat.categories: + df[col] = df[col].cat.add_categories(['']) + df[col] = df[col].fillna('') + elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) From 099c6c4f9cea7b540cfba719f6480f9b460cb62b Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:54:28 +0200 Subject: [PATCH 19/87] refactor: Refactor DeltaLakeCallback class to improve code readability and maintainability --- cryptofeed/backends/deltalake.py | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c15c85c10..b9ac6c296 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -128,7 +114,7 @@ async def writer(self): def _convert_fields(self, df: pd.DataFrame): LOG.debug("Converting fields in DataFrame.") self._convert_datetime_fields(df) - self._convert_category_fields(df) + # self._convert_category_fields(df) self._convert_int_fields(df) def _convert_datetime_fields(self, df: pd.DataFrame): @@ -155,8 +141,8 @@ def 
_convert_category_fields(self, df: pd.DataFrame): if col in df.columns: # Add empty string as a category if it's not already present categories = df[col].unique().tolist() - if '' not in categories: - categories.append('') + if "" not in categories: + categories.append("") df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): @@ -255,9 +241,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: for col in df.columns: if pd.api.types.is_categorical_dtype(df[col]): # Ensure '' is in the categories before filling - if '' not in df[col].cat.categories: - df[col] = df[col].cat.add_categories(['']) - df[col] = df[col].fillna('') + if "" not in df[col].cat.categories: + df[col] = df[col].cat.add_categories([""]) + df[col] = df[col].fillna("") elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): From d3a9c717a4e788df330538a6e6ed058ad63b4f08 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:00:09 +0200 Subject: [PATCH 20/87] fix: Improve error handling and logging in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 34 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index b9ac6c296..0ed17520b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -219,8 +233,9 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame + # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame:\n{df}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" @@ -239,19 +254,16 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_categorical_dtype(df[col]): - # Ensure '' is in the categories before filling - if "" not in df[col].cat.categories: - df[col] = df[col].cat.add_categories([""]) - df[col] = df[col].fillna("") - elif pd.api.types.is_string_dtype(df[col]): + if pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) elif pd.api.types.is_bool_dtype(df[col]): df[col] = df[col].fillna(False) elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + df[col] = df[col].fillna(pd.Timestamp.min) + else: + df[col] = df[col].fillna(None) return df async def _optimize_table(self): From fd825345a05f2e7a0607833c96557693ee165a66 Mon Sep 17 00:00:00 2001 From: Tommy 
K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:07:09 +0200 Subject: [PATCH 21/87] fix: Optimize Delta Lake table by filling null values with empty strings --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ed17520b..ccc8232b5 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -263,7 +249,7 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna(None) + df[col] = df[col].fillna("") return df async def _optimize_table(self): From 1b37d26de3e9c8972ad17bf053b5f7a75ce53b34 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:44:08 +0200 Subject: [PATCH 22/87] fix: optimize handling of missing data in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ccc8232b5..16a25e2b1 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -249,7 +263,8 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna("") + # For any other data types, use an empty string as a fallback + df[col] = df[col].astype(object).fillna("") return df async def _optimize_table(self): From 81870dc8bd5ad3a16b210b86bee0792393a4ac80 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 03:55:54 +0200 Subject: [PATCH 23/87] feat: Add custom transformations and improve column validation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 217 +++++++++++++++++++------------ 1 file changed, 131 insertions(+), 86 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 16a25e2b1..e135b87c0 100644 --- a/cryptofeed/backends/deltalake.py +++ 
b/cryptofeed/backends/deltalake.py @@ -49,6 +49,7 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): super().__init__() @@ -65,6 +66,16 @@ def __init__( self.running = True self.numeric_type = numeric_type self.none_to = none_to + self.transformations = [ + self._rename_custom_columns, + self._convert_datetime_columns, + self._convert_int_columns, + self._ensure_partition_columns, + self._handle_missing_values, + self._reorder_columns, + ] + if custom_transformations: + self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() @@ -112,76 +123,146 @@ async def writer(self): async with self.read_queue() as updates: if updates: LOG.info(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) - self._convert_fields(df) - # Reorder columns to put exchange and symbol first - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", "symbol"] - ] - df = df[cols] - if self.custom_columns: - df = df.rename(columns=self.custom_columns) + self._transform_columns(df) + self._validate_columns(df) await self._write_batch(df) - def _convert_fields(self, df: pd.DataFrame): - LOG.debug("Converting fields in DataFrame.") - self._convert_datetime_fields(df) - # self._convert_category_fields(df) - self._convert_int_fields(df) + def _validate_columns(self, df: pd.DataFrame): + LOG.debug("Validating DataFrame columns.") + # Check for required columns + required_columns = ["exchange", "symbol", "dt"] + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") + + # Validate partition columns + for col in self.partition_cols: + if col not in df.columns: + raise ValueError(f"Partition column '{col}' not found in DataFrame") + if df[col].isnull().any(): + raise ValueError(f"Partition column '{col}' contains null values") + + # Validate data types + expected_types = { + "exchange": "object", + "symbol": "object", + "dt": "object", + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", + } + for col, expected_type in expected_types.items(): + if col in df.columns and not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) + + LOG.debug("DataFrame columns validation completed successfully.") + + def _transform_columns(self, df: pd.DataFrame): + LOG.debug("Transforming columns in DataFrame.") + for transformation in self.transformations: + transformation(df) + + def _rename_custom_columns(self, df: pd.DataFrame): + if self.custom_columns: + LOG.debug("Renaming columns based on custom_columns configuration.") + df.rename(columns=self.custom_columns, inplace=True) + + def _reorder_columns(self, df: pd.DataFrame): + LOG.debug("Reordering columns to prioritize exchange and symbol.") + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] + df.reindex(columns=cols, inplace=True) - def _convert_datetime_fields(self, df: pd.DataFrame): - LOG.debug("Converting datetime fields.") + def _convert_datetime_columns(self, df: pd.DataFrame): + LOG.debug("Converting datetime columns.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = 
pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") + df[col] = pd.to_datetime(df[col], unit="ms") + + # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + elif "receipt_timestamp" in df.columns: + df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + else: + LOG.warning("No timestamp column found. Using current date for 'dt'.") + df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") - def _convert_category_fields(self, df: pd.DataFrame): - LOG.debug("Converting category fields.") - category_columns = [ - "exchange", - "symbol", - "side", - "type", - "status", - "currency", - "liquidity", - ] - for col in category_columns: - if col in df.columns: - # Add empty string as a category if it's not already present - categories = df[col].unique().tolist() - if "" not in categories: - categories.append("") - df[col] = pd.Categorical(df[col], categories=categories) - - def _convert_int_fields(self, df: pd.DataFrame): - LOG.debug("Converting integer fields.") + def _convert_int_columns(self, df: pd.DataFrame): + LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] for col in int_columns: if col in df.columns: - df[col] = df[col].astype("int64") + df[col] = pd.to_numeric(df[col], errors="coerce").astype( + "Int64" + ) # Use nullable integer type - async def _write_batch(self, df: pd.DataFrame): - if df.empty: - LOG.warning("DataFrame is empty. Skipping write operation.") - return - - # Ensure all partition columns are present in the DataFrame + def _ensure_partition_columns(self, df: pd.DataFrame): + LOG.debug("Ensuring all partition columns are present and not null.") for col in self.partition_cols: if col not in df.columns: - if col == "exchange" or col == "symbol": - df[col] = "" # Default to empty string for categorical columns + if col in ["exchange", "symbol"]: + df[col] = "unknown" elif col == "dt": - df[col] = pd.Timestamp.min.strftime( - "%Y-%m-%d" - ) # Default to min date for date columns + # 'dt' should already be created in _convert_datetime_columns + LOG.warning("'dt' column not found. This should not happen.") + df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") else: - df[col] = 0 # Default to 0 for numeric columns + df[col] = "unknown" + + # Fill any remaining null values + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." + ) + df[col] = df[col].fillna( + "unknown" + if col != "dt" + else pd.Timestamp.now().strftime("%Y-%m-%d") + ) + + def _handle_missing_values(self, df: pd.DataFrame): + LOG.debug("Handling missing values.") + for col in df.columns: + if col in ["exchange", "symbol"]: # Removed 'dt' from this list + # These are partition columns and should never be null + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." 
+ ) + df[col] = df[col].fillna("unknown") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else 0 + ) + elif pd.api.types.is_string_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + elif pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else False + ) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else pd.NaT + ) + else: + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") + return max_retries = 3 retry_delay = 5 # seconds @@ -191,27 +272,11 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - # Debug output the schema of the DataFrame LOG.debug(f"DataFrame schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ms] - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ms]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) - - # Handle null values - df = self._handle_null_values(df) - LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) - # Debug output the schema of the DataFrame - LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") write_deltalake( self.delta_table_path, @@ -233,10 +298,8 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error(f"DataFrame:\n{df}") - LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) @@ -249,24 +312,6 @@ async def _write_batch(self, df: pd.DataFrame): "Max retries reached. Failed to write batch to Delta Lake." 
) - def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: - if self.none_to is not None: - return df.fillna(self.none_to) - else: - for col in df.columns: - if pd.api.types.is_string_dtype(df[col]): - df[col] = df[col].fillna("") - elif pd.api.types.is_numeric_dtype(df[col]): - df[col] = df[col].fillna(0) - elif pd.api.types.is_bool_dtype(df[col]): - df[col] = df[col].fillna(False) - elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min) - else: - # For any other data types, use an empty string as a fallback - df[col] = df[col].astype(object).fillna("") - return df - async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 880c079063d4df2817f7509ffff9c85245406466 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:11:14 +0200 Subject: [PATCH 24/87] fix: Add logging configuration to deltalake backend --- cryptofeed/backends/deltalake.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e135b87c0..94628ad1e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -10,28 +10,16 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union -import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +# Add these lines after the imports +logging.basicConfig(level=logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") From e45dee7f3ed1e59c43cbe2f2308696c039279a04 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:19:36 +0200 Subject: [PATCH 25/87] fix: Initialize DeltaLakeCallback and add logging for writer method and _write_batch --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 94628ad1e..91c139d5b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -40,6 +40,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): + LOG.debug("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -107,8 +108,10 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): + LOG.debug("Writer method called") while self.running: async with self.read_queue() as updates: + LOG.debug(f"Read queue returned {len(updates)} updates") if updates: LOG.info(f"Received {len(updates)} updates for processing.") @@ -248,6 +251,7 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): + LOG.debug(f"_write_batch called with 
DataFrame of shape {df.shape}") if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return From fcaa65f4c88ba467e62e34ebcd83eec30814db0e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:23:20 +0200 Subject: [PATCH 26/87] fix: Change logging levels from DEBUG to WARNING in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 43 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 91c139d5b..68250f6c8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -17,9 +17,10 @@ from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + # Add these lines after the imports -logging.basicConfig(level=logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) +# logging.basicConfig(level=logging.DEBUG) +# logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") @@ -40,7 +41,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.debug("Initializing DeltaLakeCallback") + LOG.warning("Initializing DeltaLakeCallback") # Changed to warning super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -108,12 +109,16 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.debug("Writer method called") + LOG.warning("Writer method called") # Changed to warning while self.running: async with self.read_queue() as updates: - LOG.debug(f"Read queue returned {len(updates)} updates") + LOG.warning( + f"Read queue returned {len(updates)} updates" + ) # Changed to warning if updates: - LOG.info(f"Received {len(updates)} updates for processing.") + LOG.warning( + f"Received {len(updates)} updates for processing." + ) # Changed to warning df = pd.DataFrame(updates) @@ -251,7 +256,9 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): - LOG.debug(f"_write_batch called with DataFrame of shape {df.shape}") + LOG.warning( + f"_write_batch called with DataFrame of shape {df.shape}" + ) # Changed to warning if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -261,14 +268,14 @@ async def _write_batch(self, df: pd.DataFrame): for attempt in range(max_retries): try: - LOG.info( + LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) - LOG.debug(f"DataFrame schema:\n{df.dtypes}") + ) # Changed to warning + LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning - LOG.info( + LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) + ) # Changed to warning write_deltalake( self.delta_table_path, @@ -286,7 +293,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.info("Batch write successful.") + LOG.warning("Batch write successful.") # Changed to warning break # Exit the retry loop if write is successful except Exception as e: @@ -297,7 +304,9 @@ async def _write_batch(self, df: pd.DataFrame): ) if attempt < max_retries - 1: - LOG.info(f"Retrying in {retry_delay} seconds...") + LOG.warning( + f"Retrying in {retry_delay} seconds..." 
+ ) # Changed to warning await asyncio.sleep(retry_delay) else: LOG.error( @@ -305,12 +314,14 @@ async def _write_batch(self, df: pd.DataFrame): ) async def _optimize_table(self): - LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + LOG.warning( + f"Running OPTIMIZE on table {self.delta_table_path}" + ) # Changed to warning dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.info("OPTIMIZE operation completed.") + LOG.warning("OPTIMIZE operation completed.") # Changed to warning def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 414558816cad0b94f0fced799624db30ec9dca9e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:49:40 +0200 Subject: [PATCH 27/87] fix: Improve logging and error handling in DeltaLakeCallback writer method --- cryptofeed/backends/deltalake.py | 61 ++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 68250f6c8..81f6f34b8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) # Add these lines after the imports @@ -109,23 +123,32 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.warning("Writer method called") # Changed to warning + LOG.warning("Writer method started") while self.running: - async with self.read_queue() as updates: - LOG.warning( - f"Read queue returned {len(updates)} updates" - ) # Changed to warning - if updates: - LOG.warning( - f"Received {len(updates)} updates for processing." 
- ) # Changed to warning - - df = pd.DataFrame(updates) - - self._transform_columns(df) - self._validate_columns(df) - - await self._write_batch(df) + try: + async with self.read_queue() as updates: + LOG.warning(f"Read queue returned {len(updates)} updates") + if updates: + LOG.warning(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_fields(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + else: + LOG.warning("No updates received, continuing loop") + except Exception as e: + LOG.error(f"Error in writer method: {e}", exc_info=True) + LOG.warning("Writer method ended") def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") From 274ca48695b4da3df9e4475163f14e8548e13b26 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:59:06 +0200 Subject: [PATCH 28/87] fix: Refactor field transformation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 81f6f34b8..0ce9b9116 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,23 +13,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) # Add these lines after the imports @@ -134,7 +120,7 @@ async def writer(self): LOG.warning(f"Created DataFrame with shape: {df.shape}") LOG.warning("Starting field transformation") - self._transform_fields(df) + self._transform_columns(df) LOG.warning("Field transformation completed") LOG.warning("Validating columns") From 43bbd3ad63c0258edb6fae5f981625b8008fef4d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:15:11 +0200 Subject: [PATCH 29/87] fix: Reorder columns to prioritize exchange and symbol --- cryptofeed/backends/deltalake.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ce9b9116..314bae1bf 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -62,7 +62,6 @@ def __init__( self._convert_int_columns, self._ensure_partition_columns, self._handle_missing_values, - self._reorder_columns, ] if custom_transformations: self.transformations.extend(custom_transformations) @@ -179,10 +178,9 @@ def _rename_custom_columns(self, df: pd.DataFrame): def _reorder_columns(self, df: pd.DataFrame): LOG.debug("Reordering columns to prioritize exchange and symbol.") - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", 
"symbol"] - ] - df.reindex(columns=cols, inplace=True) + priority_cols = ["exchange", "symbol"] + other_cols = [col for col in df.columns if col not in priority_cols] + df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns.") From e59f57dbe389f4e12d9d2ad6e57a475febd63354 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:22:38 +0200 Subject: [PATCH 30/87] fix: Ensure datetime columns have millisecond precision in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 314bae1bf..ee7027884 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,18 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' } for col, expected_type in expected_types.items(): - if col in df.columns and not df[col].dtype == expected_type: - raise TypeError( - f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" - ) + if col in df.columns: + if expected_type.startswith("datetime64"): + # Convert to millisecond precision if it's a datetime column + df[col] = df[col].astype('datetime64[ms]') + if not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) LOG.debug("DataFrame columns validation completed successfully.") @@ -187,7 +191,8 @@ def _convert_datetime_columns(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ms") + # Convert to millisecond precision + df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: From 4bacc42ecc377a5a4d3301a3dc35692bd59cfb09 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:39:57 +0200 Subject: [PATCH 31/87] feat: Ensure datetime columns are in millisecond precision --- cryptofeed/backends/deltalake.py | 95 ++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ee7027884..f3b8eff50 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' - "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type.startswith("datetime64"): - # Convert to millisecond precision if it's a datetime column - df[col] = df[col].astype('datetime64[ms]') + if expected_type == "datetime64[ms]": + # Ensure datetime columns are in millisecond precision + df[col] = df[col].astype("datetime64[ms]") if not df[col].dtype == expected_type: raise 
TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,22 +187,47 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns.") + LOG.debug("Converting datetime columns to millisecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - # Convert to millisecond precision - df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') + # Log sample of original values + LOG.warning( + f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" + ) + # Convert to millisecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Log sample of converted values in readable format + if len(df) > 0: + readable_time = ( + df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + ) + LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' + min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["timestamp"] + .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) elif "receipt_timestamp" in df.columns: - df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["receipt_timestamp"] + .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) else: LOG.warning("No timestamp column found. Using current date for 'dt'.") df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + # Log sample of 'dt' column + if "dt" in df.columns and len(df) > 0: + LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + + LOG.debug("Datetime columns converted to millisecond precision.") + def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] @@ -363,8 +388,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -381,14 +406,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ns] (nullable) + - next_funding_time: datetime64[ms] (nullable) - predicted_rate: float64 (nullable) """ @@ -397,8 +422,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -411,8 +436,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: 
datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -424,8 +449,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -441,8 +466,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -461,13 +486,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - - start: datetime64[ns] - - stop: datetime64[ns] + - start: datetime64[ms] + - stop: datetime64[ms] - interval: string - trades: int64 (nullable) - open: float64 @@ -483,8 +508,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -504,8 +529,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -519,8 +544,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -533,8 +558,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category From d520c0269b532fb23a64af27c8180e2ec681be8e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 00:42:34 +0200 Subject: [PATCH 32/87] chore: Convert datetime columns to microsecond precision --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 72 ++++++++++++++++---------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index ac64f2b9e..ed13eb5fa 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ ENV/ # PyCharm .idea/ .aider* +.trunk/ diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index f3b8eff50..af5d6b31e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def 
_validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[us]", + "receipt_timestamp": "datetime64[us]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type == "datetime64[ms]": - # Ensure datetime columns are in millisecond precision - df[col] = df[col].astype("datetime64[ms]") + if expected_type == "datetime64[us]": + # Ensure datetime columns are in microsecond precision + df[col] = df[col].astype("datetime64[us]") if not df[col].dtype == expected_type: raise TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,7 +187,7 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns to millisecond precision.") + LOG.debug("Converting datetime columns to microsecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: @@ -195,13 +195,11 @@ def _convert_datetime_columns(self, df: pd.DataFrame): LOG.warning( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) - # Convert to millisecond precision, handling both string and datetime inputs - df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Convert to microsecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[us]") # Log sample of converted values in readable format if len(df) > 0: - readable_time = ( - df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] - ) + readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' @@ -226,7 +224,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): if "dt" in df.columns and len(df) > 0: LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") - LOG.debug("Datetime columns converted to millisecond precision.") + LOG.debug("Datetime columns converted to microsecond precision.") def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") @@ -388,8 +386,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -406,14 +404,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ms] (nullable) + - next_funding_time: datetime64[us] (nullable) - predicted_rate: float64 (nullable) """ @@ -422,8 +420,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category 
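The hunks above convert the timestamp columns to millisecond precision and derive the 'dt' partition column with a validity floor, falling back to the current date when no usable timestamp exists. A minimal standalone sketch of that derivation follows, assuming cryptofeed-style epoch-second timestamps and a hypothetical sample row; it is an illustration of the logic, not part of the patch.

import pandas as pd

# Hypothetical one-row batch; the epoch-seconds input unit is an assumption.
df = pd.DataFrame({
    "exchange": ["COINBASE"],
    "symbol": ["BTC-USD"],
    "timestamp": [1693526400.123],
})

# Parse and truncate to millisecond precision, mirroring _convert_datetime_columns.
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s").astype("datetime64[ms]")

# Replace obviously invalid timestamps before deriving the partition value.
min_valid_date = pd.Timestamp("2000-01-01")
df["dt"] = (
    df["timestamp"]
    .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now())
    .dt.strftime("%Y-%m-%d")
)

print(df[["timestamp", "dt"]])  # 'dt' is the string partition key, e.g. 2023-09-01
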
@@ -436,8 +434,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -449,8 +447,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -466,8 +464,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -486,13 +484,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - - start: datetime64[ms] - - stop: datetime64[ms] + - start: datetime64[us] + - stop: datetime64[us] - interval: string - trades: int64 (nullable) - open: float64 @@ -508,8 +506,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -529,8 +527,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -544,8 +542,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -558,8 +556,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category From b11f9a4494f66cb19c9e556082fe09b3f3ea96e0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:04:50 +0200 Subject: [PATCH 33/87] fix: Change log levels from warning to debug for non-critical messages --- cryptofeed/backends/deltalake.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index af5d6b31e..412794626 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -41,7 +41,7 @@ 
def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.warning("Initializing DeltaLakeCallback") # Changed to warning + LOG.warning("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -192,7 +192,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in datetime_columns: if col in df.columns: # Log sample of original values - LOG.warning( + LOG.debug( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) # Convert to microsecond precision, handling both string and datetime inputs @@ -200,7 +200,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of converted values in readable format if len(df) > 0: readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") - LOG.warning(f"Sample {col} after conversion: {readable_time}") + LOG.debug(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed @@ -222,7 +222,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: - LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") LOG.debug("Datetime columns converted to microsecond precision.") @@ -293,7 +293,7 @@ def _handle_missing_values(self, df: pd.DataFrame): async def _write_batch(self, df: pd.DataFrame): LOG.warning( f"_write_batch called with DataFrame of shape {df.shape}" - ) # Changed to warning + ) if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -305,12 +305,12 @@ async def _write_batch(self, df: pd.DataFrame): try: LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) # Changed to warning - LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning + ) + LOG.debug(f"DataFrame schema:\n{df.dtypes}") LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) # Changed to warning + ) write_deltalake( self.delta_table_path, @@ -328,7 +328,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.warning("Batch write successful.") # Changed to warning + LOG.warning("Batch write successful.") break # Exit the retry loop if write is successful except Exception as e: @@ -341,7 +341,7 @@ async def _write_batch(self, df: pd.DataFrame): if attempt < max_retries - 1: LOG.warning( f"Retrying in {retry_delay} seconds..." 
- ) # Changed to warning + ) await asyncio.sleep(retry_delay) else: LOG.error( @@ -351,12 +351,12 @@ async def _write_batch(self, df: pd.DataFrame): async def _optimize_table(self): LOG.warning( f"Running OPTIMIZE on table {self.delta_table_path}" - ) # Changed to warning + ) dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.warning("OPTIMIZE operation completed.") # Changed to warning + LOG.warning("OPTIMIZE operation completed.") def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 11893dbfc66171e42f98e8bbc081601f7f11ca5b Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:25:25 +0200 Subject: [PATCH 34/87] refactor: Simplify datetime column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 40 +++++++++++++------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 412794626..d11c3c596 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -205,20 +205,12 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = ( - df["timestamp"] - .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["timestamp"].where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date elif "receipt_timestamp" in df.columns: - df["dt"] = ( - df["receipt_timestamp"] - .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["receipt_timestamp"].where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date else: LOG.warning("No timestamp column found. Using current date for 'dt'.") - df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + df["dt"] = pd.Timestamp.now().date() # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: @@ -244,7 +236,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): elif col == "dt": # 'dt' should already be created in _convert_datetime_columns LOG.warning("'dt' column not found. 
This should not happen.") - df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") + df[col] = pd.Timestamp.now().date() else: df[col] = "unknown" @@ -256,7 +248,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): df[col] = df[col].fillna( "unknown" if col != "dt" - else pd.Timestamp.now().strftime("%Y-%m-%d") + else pd.Timestamp.now().date() ) def _handle_missing_values(self, df: pd.DataFrame): @@ -388,7 +380,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 (nullable) @@ -406,7 +398,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - mark_price: float64 (nullable) @@ -422,7 +414,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - bid: float64 @@ -436,7 +428,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - open_interest: float64 @@ -449,7 +441,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - side: category @@ -466,7 +458,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) @@ -486,7 +478,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - start: datetime64[us] @@ -508,7 +500,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 @@ -529,7 +521,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - type: category @@ -544,7 +536,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - balance: float64 @@ -558,7 +550,7 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - price: float64 From 313a8f829c2ffc012be96e810b804f6250a50d34 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:05:38 +0200 Subject: [PATCH 35/87] feat: Add batch processing and 
flush interval to DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 53 +++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d11c3c596..5bfa7bd24 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -7,6 +7,7 @@ import asyncio import logging +import time from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -38,6 +39,8 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + batch_size: int = 1000, + flush_interval: float = 60.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): @@ -67,6 +70,10 @@ def __init__( self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() + self.batch_size = batch_size + self.flush_interval = flush_interval + self.batch = [] + self.last_flush_time = time.time() def _validate_configuration(self): if self.optimize_interval <= 0: @@ -115,26 +122,41 @@ async def writer(self): LOG.warning(f"Read queue returned {len(updates)} updates") if updates: LOG.warning(f"Received {len(updates)} updates for processing.") - df = pd.DataFrame(updates) - LOG.warning(f"Created DataFrame with shape: {df.shape}") + self.batch.extend(updates) - LOG.warning("Starting field transformation") - self._transform_columns(df) - LOG.warning("Field transformation completed") - - LOG.warning("Validating columns") - self._validate_columns(df) - LOG.warning("Columns validation completed") - - LOG.warning("Starting batch write") - await self._write_batch(df) - LOG.warning("Batch write completed") + if len(self.batch) >= self.batch_size or (time.time() - self.last_flush_time) >= self.flush_interval: + await self._process_batch() else: - LOG.warning("No updates received, continuing loop") + # Check if we need to flush based on time + if (time.time() - self.last_flush_time) >= self.flush_interval and self.batch: + await self._process_batch() + else: + LOG.warning("No updates received, continuing loop") + await asyncio.sleep(1) # Add a small delay to prevent busy-waiting except Exception as e: LOG.error(f"Error in writer method: {e}", exc_info=True) LOG.warning("Writer method ended") + async def _process_batch(self): + LOG.warning(f"Processing batch of {len(self.batch)} updates") + df = pd.DataFrame(self.batch) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_columns(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + + self.batch = [] + self.last_flush_time = time.time() + def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") # Check for required columns @@ -357,6 +379,9 @@ def _update_metadata(self): async def stop(self): LOG.info("Stopping DeltaLakeCallback writer.") self.running = False + # Flush any remaining data + if self.batch: + await self._process_batch() def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: From f5d3f8c38101ae4f7e0f8a0a943e3d29bd2427f8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:25:43 +0200 
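For reference, the batching added above flushes on whichever condition is met first: the batch reaching batch_size entries, or flush_interval seconds elapsing since the last flush. A minimal sketch of that predicate, using the default values from the patch (illustrative helper, not part of the patch itself):

    import time

    def should_flush(batch: list, last_flush_time: float,
                     batch_size: int = 1000, flush_interval: float = 60.0) -> bool:
        # Never flush an empty batch; otherwise flush on size or on elapsed time.
        if not batch:
            return False
        return len(batch) >= batch_size or (time.time() - last_flush_time) >= flush_interval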
Subject: [PATCH 36/87] fix: Update copyright year in demo_deltalake.py --- examples/demo_deltalake.py | 4 ++-- setup.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index 5c973016e..95d2a405d 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -1,9 +1,9 @@ -''' +""" Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' +""" from cryptofeed import FeedHandler from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake from cryptofeed.defines import FUNDING, TICKER, TRADES diff --git a/setup.py b/setup.py index 344573348..fe38f7b7d 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,9 @@ import os import sys -from setuptools import Extension, setup -from setuptools import find_packages -from setuptools.command.test import test as TestCommand from Cython.Build import cythonize +from setuptools import Extension, find_packages, setup +from setuptools.command.test import test as TestCommand def get_long_description(): From 85ad6681100035761d38c0cf126da2725940a663 Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sat, 31 Aug 2024 23:37:59 +0200 Subject: [PATCH 37/87] feat(deltalake): Implement Delta Lake backend and add dependencies - Add DeltaLakeCallback class with support for various data types - Implement partitioning, Z-ordering, and time travel features - Add schema documentation for each data type - Include Delta Lake dependencies in setup.py - Create demo file for Delta Lake usage with S3 configuration - Update extras_require in setup.py to include deltalake option --- cryptofeed/backends/deltalake.py | 328 +++++++++++++++++++++++++++++++ examples/demo_deltalake.py | 54 +++++ setup.py | 2 + 3 files changed, 384 insertions(+) create mode 100644 cryptofeed/backends/deltalake.py create mode 100644 examples/demo_deltalake.py diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py new file mode 100644 index 000000000..1fd08e555 --- /dev/null +++ b/cryptofeed/backends/deltalake.py @@ -0,0 +1,328 @@ +''' +Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. 
+''' +from typing import Optional, List, Dict, Any +import logging +import pandas as pd +from deltalake import DeltaTable, write_deltalake + +from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK + +LOG = logging.getLogger('feedhandler') + +class DeltaLakeCallback(BackendQueue): + def __init__(self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs: Any): + super().__init__() + self.key = key or self.default_key + self.base_path = base_path + self.delta_table_path = f"{self.base_path}/{self.key}" + self.custom_columns = custom_columns or {} + self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.optimize_interval = optimize_interval + self.z_order_cols = z_order_cols or self._default_z_order_cols() + self.time_travel = time_travel + self.storage_options = storage_options or {} + self.write_count = 0 + self.running = True + + if optimize_interval <= 0: + raise ValueError("optimize_interval must be a positive integer") + + if not isinstance(self.partition_cols, list): + raise TypeError("partition_cols must be a list of strings") + + if not isinstance(self.z_order_cols, list): + raise TypeError("z_order_cols must be a list of strings") + + def _default_z_order_cols(self) -> List[str]: + common_cols = ['exchange', 'symbol', 'timestamp'] + data_specific_cols = { + TRADES: ['price', 'amount'], + FUNDING: ['rate'], + TICKER: ['bid', 'ask'], + OPEN_INTEREST: ['open_interest'], + LIQUIDATIONS: ['quantity', 'price'], + BOOK: [], # Book data is typically queried by timestamp and symbol + CANDLES: ['open', 'close', 'high', 'low'], + ORDER_INFO: ['status', 'price', 'amount'], + TRANSACTIONS: ['type', 'amount'], + BALANCES: ['balance'], + FILLS: ['price', 'amount'] + } + return common_cols + data_specific_cols.get(self.key, []) + + async def writer(self): + while self.running: + async with self.read_queue() as updates: + if updates: + df = pd.DataFrame(updates) + df['date'] = pd.to_datetime(df['timestamp'], unit='s') + df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') + df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day + + # Reorder columns to put exchange and symbol first + cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + df = df[cols] + + if self.custom_columns: + df = df.rename(columns=self.custom_columns) + + await self._write_batch(df) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + return + + try: + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + except Exception as e: + LOG.error(f"Error writing to Delta Lake: {e}") + + async def _optimize_table(self): + LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + dt = 
DeltaTable(self.delta_table_path, storage_options=self.storage_options) + dt.optimize.compact() + if self.z_order_cols: + dt.optimize.z_order(self.z_order_cols) + + def _update_metadata(self): + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") + + async def stop(self): + self.running = False + + def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: + if self.time_travel: + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + if timestamp: + return dt.version_at_timestamp(timestamp) + else: + return dt.version() + else: + LOG.warning("Time travel is not enabled for this table") + return None + +class TradeDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRADES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string (nullable) + - side: string + - amount: float64 + - price: float64 + - type: string (nullable) + """ + +class FundingDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FUNDING + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - mark_price: float64 (nullable) + - rate: float64 + - next_funding_time: datetime64[ns] (nullable) + - predicted_rate: float64 (nullable) + """ + +class TickerDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TICKER + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - bid: float64 + - ask: float64 + """ + +class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = OPEN_INTEREST + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - open_interest: float64 + """ + +class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = LIQUIDATIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - side: string + - quantity: float64 + - price: float64 + - id: string + - status: string + """ + +class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): + default_key = BOOK + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - delta: dict (nullable, contains 'bid' and 'ask' updates) + - book: dict (contains full order book snapshot when available) + """ + +class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = CANDLES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - start: datetime64[ns] + - stop: datetime64[ns] + - interval: string + - trades: int64 (nullable) + - open: float64 + - close: float64 + - high: float64 + - low: float64 + - volume: float64 + - closed: bool (nullable) + """ + +class 
OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = ORDER_INFO + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string + - client_order_id: string (nullable) + - side: string + - status: string + - type: string + - price: float64 + - amount: float64 + - remaining: float64 (nullable) + - account: string (nullable) + """ + +class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRANSACTIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - type: string + - status: string + - amount: float64 + """ + +class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = BALANCES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - balance: float64 + - reserved: float64 (nullable) + """ + +class FillsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FILLS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - price: float64 + - amount: float64 + - side: string + - fee: float64 (nullable) + - id: string + - order_id: string + - liquidity: string + - type: string + - account: string (nullable) + """ \ No newline at end of file diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py new file mode 100644 index 000000000..5c973016e --- /dev/null +++ b/examples/demo_deltalake.py @@ -0,0 +1,54 @@ +''' +Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. 
+''' +from cryptofeed import FeedHandler +from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake +from cryptofeed.defines import FUNDING, TICKER, TRADES +from cryptofeed.exchanges import Binance + + +def main(): + f = FeedHandler() + + # Define the Delta Lake base path (can be local or S3) + delta_base_path = 's3://your-bucket/path/to/delta/tables' + + # S3 storage options (remove if using local storage) + s3_options = { + "AWS_ACCESS_KEY_ID": "your_access_key", + "AWS_SECRET_ACCESS_KEY": "your_secret_key", + "AWS_REGION": "your_region" + } + + # Add Binance feed with Delta Lake callbacks + f.add_feed(Binance( + channels=[TRADES, FUNDING, TICKER], + symbols=['BTC-USDT', 'ETH-USDT'], + callbacks={ + TRADES: TradeDeltaLake( + base_path=delta_base_path, + optimize_interval=50, # More frequent table optimization + time_travel=True, # Enable time travel feature + storage_options=s3_options # Add S3 configuration + ), + FUNDING: FundingDeltaLake( + base_path=delta_base_path, + storage_options=s3_options # Add S3 configuration + ), + TICKER: TickerDeltaLake( + base_path=delta_base_path, + partition_cols=['exchange', 'symbol', 'year', 'month', 'day'], # Custom partitioning + z_order_cols=['timestamp', 'bid', 'ask'], # Enable Z-ordering + storage_options=s3_options # Add S3 configuration + ) + } + )) + + f.run() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/setup.py b/setup.py index adf10f870..344573348 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def run_tests(self): "rabbit": ["aio_pika", "pika"], "redis": ["hiredis", "redis>=4.5.1"], "zmq": ["pyzmq"], + "deltalake": ["deltalake>=0.6.1", "pandas"], "all": [ "arctic", "google_cloud_pubsub>=2.4.1", @@ -107,6 +108,7 @@ def run_tests(self): "hiredis", "redis>=4.5.1", "pyzmq", + "deltalake>=0.6.1", ], }, ) From b7a20b93dadf45356b0c489f7290d417a24643ae Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sun, 1 Sep 2024 00:11:55 +0200 Subject: [PATCH 38/87] feat(deltalake): optimize Delta Lake implementation --- cryptofeed/backends/deltalake.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 1fd08e555..bd29037ae 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,7 +5,9 @@ associated with this software. 
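With the new extras_require entry, the backend's dependencies can be installed with pip install cryptofeed[deltalake]. A minimal sketch of reading back what the demo above writes, assuming the deltalake package's DeltaTable API; the path, credentials, partition values and version number are illustrative:

    from deltalake import DeltaTable

    table_path = "s3://your-bucket/path/to/delta/tables/trades"
    storage_options = {
        "AWS_ACCESS_KEY_ID": "your_access_key",
        "AWS_SECRET_ACCESS_KEY": "your_secret_key",
        "AWS_REGION": "your_region",
    }

    dt = DeltaTable(table_path, storage_options=storage_options)
    print(dt.version())  # latest table version

    # Load a single exchange/symbol partition into pandas
    df = dt.to_pandas(partitions=[("exchange", "=", "BINANCE"), ("symbol", "=", "BTC-USDT")])
    print(df.head())

    # Pin an earlier version of the table for time travel
    dt_v0 = DeltaTable(table_path, version=0, storage_options=storage_options)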
''' from typing import Optional, List, Dict, Any +from collections import defaultdict import logging + import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -228,6 +230,11 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): + self.snapshots_only = snapshots_only + self.snapshot_interval = snapshot_interval + self.snapshot_count = defaultdict(int) + super().__init__(*args, **kwargs) class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES From 6f81da62365c52d6a5bf6cd9402004843756844d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 00:51:23 +0200 Subject: [PATCH 39/87] fix(deltalake): fix book table name --- cryptofeed/backends/deltalake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index bd29037ae..c55475f2c 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -12,7 +12,7 @@ from deltalake import DeltaTable, write_deltalake from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS LOG = logging.getLogger('feedhandler') @@ -217,7 +217,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): """ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - default_key = BOOK + default_key = "book" """ Schema: - timestamp: datetime64[ns] (from 'date' column) From d12441b27a4bb17e40a831952cfa91f90be8e57d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:01:48 +0200 Subject: [PATCH 40/87] fix(deltalake): Fix book name --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c55475f2c..c1b882c34 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -57,7 +57,7 @@ def _default_z_order_cols(self) -> List[str]: TICKER: ['bid', 'ask'], OPEN_INTEREST: ['open_interest'], LIQUIDATIONS: ['quantity', 'price'], - BOOK: [], # Book data is typically queried by timestamp and symbol + "book": [], # Book data is typically queried by timestamp and symbol CANDLES: ['open', 'close', 'high', 'low'], ORDER_INFO: ['status', 'price', 'amount'], TRANSACTIONS: ['type', 'amount'], From d6a2ae8edba04df49e3442d64c0b48cb311d63eb Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:21:33 +0200 Subject: [PATCH 41/87] fix(deltalake): Fix numeric type --- cryptofeed/backends/deltalake.py | 35 ++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c1b882c34..fda44563e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -4,9 +4,10 @@ Please see the LICENSE file for the terms and conditions associated with this software. 
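The snapshot options added to BookDeltaLake above follow the pattern of the other cryptofeed book backends: deltas are written by default, a full book snapshot is taken every snapshot_interval updates, and snapshots_only=True skips the deltas entirely. A minimal usage sketch; the exchange, symbol and path are illustrative:

    from cryptofeed import FeedHandler
    from cryptofeed.backends.deltalake import BookDeltaLake
    from cryptofeed.defines import L2_BOOK
    from cryptofeed.exchanges import Coinbase

    f = FeedHandler()
    f.add_feed(Coinbase(
        channels=[L2_BOOK],
        symbols=['BTC-USD'],
        callbacks={
            # deltas plus a periodic full snapshot every 1000 updates
            L2_BOOK: BookDeltaLake(base_path='./delta-tables', snapshot_interval=1000)
        }
    ))
    f.run()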
''' -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Union from collections import defaultdict import logging +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -16,16 +17,19 @@ LOG = logging.getLogger('feedhandler') + class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, + def __init__(self, + base_path: str, + key: Optional[str] = None, custom_columns: Optional[Dict[str, str]] = None, partition_cols: Optional[List[str]] = None, optimize_interval: int = 100, z_order_cols: Optional[List[str]] = None, time_travel: bool = False, storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, **kwargs: Any): super().__init__() self.key = key or self.default_key @@ -49,6 +53,9 @@ def __init__(self, if not isinstance(self.z_order_cols, list): raise TypeError("z_order_cols must be a list of strings") + self.numeric_type = numeric_type + self.none_to = none_to + def _default_z_order_cols(self) -> List[str]: common_cols = ['exchange', 'symbol', 'timestamp'] data_specific_cols = { @@ -89,6 +96,15 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Replace None values with the specified value + if self.none_to is not None: + df = df.fillna(self.none_to) + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( self.delta_table_path, @@ -134,6 +150,7 @@ def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: LOG.warning("Time travel is not enabled for this table") return None + class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ @@ -152,6 +169,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): - type: string (nullable) """ + class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ @@ -169,6 +187,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): - predicted_rate: float64 (nullable) """ + class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ @@ -184,6 +203,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): - ask: float64 """ + class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ @@ -198,6 +218,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): - open_interest: float64 """ + class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ @@ -216,6 +237,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): - status: string """ + class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ @@ -236,6 +258,7 @@ def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs self.snapshot_count = defaultdict(int) super().__init__(*args, **kwargs) + class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ @@ -259,6 +282,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): - closed: bool (nullable) """ + class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ @@ -281,6 +305,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): - account: string (nullable) """ + class 
TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ @@ -297,6 +322,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): - amount: float64 """ + class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ @@ -312,6 +338,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): - reserved: float64 (nullable) """ + class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ From e257d83cb942b96385107c234733d9bac55b4832 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:26:24 +0200 Subject: [PATCH 42/87] fix: Ensure timestamp columns have nanosecond precision in DeltaLake backend --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 109 +++++++++++++++++++------------ 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 5860625f7..ac64f2b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ ENV/ # PyCharm .idea/ +.aider* diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fda44563e..963488b29 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -1,49 +1,60 @@ -''' +""" Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' -from typing import Optional, List, Dict, Any, Union -from collections import defaultdict +""" + import logging -import numpy as np +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + -LOG = logging.getLogger('feedhandler') +LOG = logging.getLogger("feedhandler") class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, - custom_columns: Optional[Dict[str, str]] = None, - partition_cols: Optional[List[str]] = None, - optimize_interval: int = 100, - z_order_cols: Optional[List[str]] = None, - time_travel: bool = False, - storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = float, - none_to: Any = None, - **kwargs: Any): + def __init__( + self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, + **kwargs: Any, + ): super().__init__() self.key = key or self.default_key self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.partition_cols = partition_cols or [ + "exchange", + "symbol", + "year", + "month", + "day", + ] self.optimize_interval = optimize_interval 
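    # Every `optimize_interval` successful writes, _write_batch() calls _optimize_table(),
    # which compacts the table and Z-orders it on z_order_cols.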
self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel self.storage_options = storage_options or {} self.write_count = 0 self.running = True - + if optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") @@ -57,19 +68,19 @@ def __init__(self, self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ['exchange', 'symbol', 'timestamp'] + common_cols = ["exchange", "symbol", "timestamp"] data_specific_cols = { - TRADES: ['price', 'amount'], - FUNDING: ['rate'], - TICKER: ['bid', 'ask'], - OPEN_INTEREST: ['open_interest'], - LIQUIDATIONS: ['quantity', 'price'], + TRADES: ["price", "amount"], + FUNDING: ["rate"], + TICKER: ["bid", "ask"], + OPEN_INTEREST: ["open_interest"], + LIQUIDATIONS: ["quantity", "price"], "book": [], # Book data is typically queried by timestamp and symbol - CANDLES: ['open', 'close', 'high', 'low'], - ORDER_INFO: ['status', 'price', 'amount'], - TRANSACTIONS: ['type', 'amount'], - BALANCES: ['balance'], - FILLS: ['price', 'amount'] + CANDLES: ["open", "close", "high", "low"], + ORDER_INFO: ["status", "price", "amount"], + TRANSACTIONS: ["type", "amount"], + BALANCES: ["balance"], + FILLS: ["price", "amount"], } return common_cols + data_specific_cols.get(self.key, []) @@ -78,17 +89,25 @@ async def writer(self): async with self.read_queue() as updates: if updates: df = pd.DataFrame(updates) - df['date'] = pd.to_datetime(df['timestamp'], unit='s') - df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') - df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day - + df["date"] = pd.to_datetime(df["timestamp"], unit="s") + df["receipt_timestamp"] = pd.to_datetime( + df["receipt_timestamp"], unit="s" + ) + df["year"], df["month"], df["day"] = ( + df["date"].dt.year, + df["date"].dt.month, + df["date"].dt.day, + ) + # Reorder columns to put exchange and symbol first - cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] df = df[cols] - + if self.custom_columns: df = df.rename(columns=self.custom_columns) - + await self._write_batch(df) async def _write_batch(self, df: pd.DataFrame): @@ -96,6 +115,11 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Ensure timestamp columns are in nanosecond precision + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns for col in numeric_columns: @@ -112,7 +136,7 @@ async def _write_batch(self, df: pd.DataFrame): mode="append", partition_by=self.partition_cols, schema_mode="merge", - storage_options=self.storage_options + storage_options=self.storage_options, ) self.write_count += 1 @@ -252,6 +276,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): self.snapshots_only = snapshots_only self.snapshot_interval = snapshot_interval @@ -359,4 +384,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: string - type: string - account: string (nullable) - """ \ No newline 
at end of file + """ From 265e7616f4b0d7a71f1a0987ef453ab23add0d78 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:47:21 +0200 Subject: [PATCH 43/87] feat: Refactor timestamp column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 963488b29..fa6050edb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -115,10 +129,10 @@ async def _write_batch(self, df: pd.DataFrame): return try: - # Ensure timestamp columns are in nanosecond precision + # Convert timestamp columns from ns to us timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[us]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From 00d81ec358ca94d76a741cbd6f484767bb6cc83d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:25:31 +0200 Subject: [PATCH 44/87] fix: Handle null values in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fa6050edb..dca2dd07d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -82,7 +82,7 @@ def __init__( self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ["exchange", "symbol", "timestamp"] + common_cols = ["timestamp"] data_specific_cols = { TRADES: ["price", "amount"], FUNDING: ["rate"], @@ -96,7 +96,9 @@ def _default_z_order_cols(self) -> List[str]: BALANCES: ["balance"], FILLS: ["price", "amount"], } - return common_cols + data_specific_cols.get(self.key, []) + z_order_cols = common_cols + data_specific_cols.get(self.key, []) + # Remove any columns that are already in partition_cols + return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): while self.running: @@ -139,9 +141,20 @@ async def _write_batch(self, df: pd.DataFrame): for col in numeric_columns: df[col] = df[col].astype(self.numeric_type) - # Replace None values with the specified value + # Handle null values if self.none_to is not None: df = df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == 'object': + df[col] = df[col].fillna('') # Replace None with empty string for object columns + elif df[col].dtype in ['float64', 'int64']: + df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns + elif df[col].dtype == 
'bool': + df[col] = df[col].fillna(False) # Replace None with False for boolean columns + elif df[col].dtype == 'datetime64[us]': + df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( From d6ed8b90ca0d9fed66071690a89ff287318e02f7 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:51:21 +0200 Subject: [PATCH 45/87] feat: Implement DeltaLake backend for Cryptofeed --- cryptofeed/backends/deltalake.py | 357 +++++++++++++++++-------------- 1 file changed, 200 insertions(+), 157 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index dca2dd07d..d75277e1b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,6 +5,7 @@ associated with this software. """ +import asyncio import logging from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -13,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -55,13 +42,7 @@ def __init__( self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or [ - "exchange", - "symbol", - "year", - "month", - "day", - ] + self.partition_cols = partition_cols or ["exchange", "symbol", "dt"] self.optimize_interval = optimize_interval self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel @@ -69,17 +50,31 @@ def __init__( self.write_count = 0 self.running = True - if optimize_interval <= 0: + # Validate configuration parameters + self._validate_configuration() + + self.numeric_type = numeric_type + self.none_to = none_to + + def _validate_configuration(self): + if self.optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") - if not isinstance(self.partition_cols, list): + if not isinstance(self.partition_cols, list) or not all( + isinstance(col, str) for col in self.partition_cols + ): raise TypeError("partition_cols must be a list of strings") - if not isinstance(self.z_order_cols, list): + if not isinstance(self.z_order_cols, list) or not all( + isinstance(col, str) for col in self.z_order_cols + ): raise TypeError("z_order_cols must be a list of strings") - self.numeric_type = numeric_type - self.none_to = none_to + if not isinstance(self.storage_options, dict): + raise TypeError("storage_options must be a dictionary") + + if not isinstance(self.numeric_type, (type, str)): + raise TypeError("numeric_type must be a type or a string") def _default_z_order_cols(self) -> List[str]: common_cols = ["timestamp"] @@ -104,17 +99,9 @@ async def writer(self): while self.running: async with self.read_queue() as updates: if updates: + LOG.info(f"Received {len(updates)} updates for processing.") df = pd.DataFrame(updates) - df["date"] = 
pd.to_datetime(df["timestamp"], unit="s") - df["receipt_timestamp"] = pd.to_datetime( - df["receipt_timestamp"], unit="s" - ) - df["year"], df["month"], df["day"] = ( - df["date"].dt.year, - df["date"].dt.month, - df["date"].dt.day, - ) - + self._convert_fields(df) # Reorder columns to put exchange and symbol first cols = ["exchange", "symbol"] + [ col for col in df.columns if col not in ["exchange", "symbol"] @@ -126,55 +113,126 @@ async def writer(self): await self._write_batch(df) + def _convert_fields(self, df: pd.DataFrame): + LOG.debug("Converting fields in DataFrame.") + self._convert_datetime_fields(df) + self._convert_category_fields(df) + self._convert_int_fields(df) + + def _convert_datetime_fields(self, df: pd.DataFrame): + LOG.debug("Converting datetime fields.") + datetime_columns = ["timestamp", "receipt_timestamp"] + for col in datetime_columns: + if col in df.columns: + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + if "timestamp" in df.columns: + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + + def _convert_category_fields(self, df: pd.DataFrame): + LOG.debug("Converting category fields.") + category_columns = [ + "exchange", + "symbol", + "side", + "type", + "status", + "currency", + "liquidity", + ] + for col in category_columns: + if col in df.columns: + df[col] = df[col].astype("category") + + def _convert_int_fields(self, df: pd.DataFrame): + LOG.debug("Converting integer fields.") + int_columns = ["id", "trade_id", "trades"] + for col in int_columns: + if col in df.columns: + df[col] = df[col].astype("int64") + async def _write_batch(self, df: pd.DataFrame): if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") return - try: - # Convert timestamp columns from ns to us - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[us]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) + max_retries = 3 + retry_delay = 5 # seconds + + for attempt in range(max_retries): + try: + LOG.info( + f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." 
+ ) + # Convert timestamp columns to datetime64[ns] + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Handle null values + df = self._handle_null_values(df) + + LOG.info( + f"Writing batch of {len(df)} records to {self.delta_table_path}" + ) + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options, + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + LOG.info("Batch write successful.") + break # Exit the retry loop if write is successful + + except Exception as e: + LOG.error( + f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" + ) + if attempt < max_retries - 1: + LOG.info(f"Retrying in {retry_delay} seconds...") + await asyncio.sleep(retry_delay) + else: + LOG.error( + "Max retries reached. Failed to write batch to Delta Lake." + ) - # Handle null values - if self.none_to is not None: - df = df.fillna(self.none_to) - else: - # Replace None with appropriate default values based on column type - for col in df.columns: - if df[col].dtype == 'object': - df[col] = df[col].fillna('') # Replace None with empty string for object columns - elif df[col].dtype in ['float64', 'int64']: - df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns - elif df[col].dtype == 'bool': - df[col] = df[col].fillna(False) # Replace None with False for boolean columns - elif df[col].dtype == 'datetime64[us]': - df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns - - LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") - write_deltalake( - self.delta_table_path, - df, - mode="append", - partition_by=self.partition_cols, - schema_mode="merge", - storage_options=self.storage_options, - ) - self.write_count += 1 - - if self.write_count % self.optimize_interval == 0: - await self._optimize_table() - - if self.time_travel: - self._update_metadata() - - except Exception as e: - LOG.error(f"Error writing to Delta Lake: {e}") + def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: + if self.none_to is not None: + return df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].fillna( + "" + ) # Replace None with empty string for object columns + elif df[col].dtype in ["float64", "int64"]: + df[col] = df[col].fillna( + 0 + ) # Replace None with 0 for numeric columns + elif df[col].dtype == "bool": + df[col] = df[col].fillna( + False + ) # Replace None with False for boolean columns + elif df[col].dtype == "datetime64[ns]": + df[col] = df[col].fillna( + pd.Timestamp.min + ) # Replace None with minimum timestamp for datetime columns + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") @@ -182,21 +240,27 @@ async def _optimize_table(self): dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) + LOG.info("OPTIMIZE operation completed.") def _update_metadata(self): dt = 
DeltaTable(self.delta_table_path, storage_options=self.storage_options) LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") async def stop(self): + LOG.info("Stopping DeltaLakeCallback writer.") self.running = False def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) if timestamp: - return dt.version_at_timestamp(timestamp) + version = dt.version_at_timestamp(timestamp) + LOG.info(f"Retrieved version {version} for timestamp {timestamp}.") + return version else: - return dt.version() + version = dt.version() + LOG.info(f"Retrieved current version {version}.") + return version else: LOG.warning("Time travel is not enabled for this table") return None @@ -208,16 +272,15 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string (nullable) - - side: string + - dt: string + - exchange: category + - symbol: category + - id: int64 (nullable) + - side: category - amount: float64 - price: float64 - - type: string (nullable) + - type: category (nullable) + - trade_id: int64 """ @@ -227,11 +290,9 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - mark_price: float64 (nullable) - rate: float64 - next_funding_time: datetime64[ns] (nullable) @@ -245,11 +306,9 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - bid: float64 - ask: float64 """ @@ -261,11 +320,9 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - open_interest: float64 """ @@ -276,16 +333,14 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - side: string + - dt: string + - exchange: category + - symbol: category + - side: category - quantity: float64 - price: float64 - - id: string - - status: string + - id: int64 + - status: category """ @@ -295,11 +350,9 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ @@ -317,11 +370,9 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - 
year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - start: datetime64[ns] - stop: datetime64[ns] - interval: string @@ -341,16 +392,14 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string + - dt: string + - exchange: category + - symbol: category + - id: int64 - client_order_id: string (nullable) - - side: string - - status: string - - type: string + - side: category + - status: category + - type: category - price: float64 - amount: float64 - remaining: float64 (nullable) @@ -364,13 +413,11 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string - - type: string - - status: string + - dt: string + - exchange: category + - currency: category + - type: category + - status: category - amount: float64 """ @@ -381,11 +428,9 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string + - dt: string + - exchange: category + - currency: category - balance: float64 - reserved: float64 (nullable) """ @@ -397,18 +442,16 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - price: float64 - amount: float64 - - side: string + - side: category - fee: float64 (nullable) - - id: string - - order_id: string - - liquidity: string - - type: string + - id: int64 + - order_id: int64 + - liquidity: category + - type: category - account: string (nullable) """ From 16f282588b47c20a64b3981cb97a1eb670f5bb9f Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:57:16 +0200 Subject: [PATCH 46/87] fix: Refactor DeltaLakeCallback class --- cryptofeed/backends/deltalake.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d75277e1b..6e158f364 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -49,12 +63,10 @@ def __init__( self.storage_options = storage_options or {} self.write_count = 0 self.running = True - - # Validate configuration parameters - self._validate_configuration() - 
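    # numeric_type and none_to are assigned before _validate_configuration() is called,
    # because the validation checks self.numeric_type; calling it before these assignments
    # (as in the previous patch) would raise AttributeError.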
self.numeric_type = numeric_type self.none_to = none_to + # Validate configuration parameters + self._validate_configuration() def _validate_configuration(self): if self.optimize_interval <= 0: From d51babc3ed1975075222509b27a7f9e7c1ea350e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:13:37 +0200 Subject: [PATCH 47/87] fix: Add debug logging for DataFrame schema in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 6e158f364..7991f84cb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -175,6 +175,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema:\n{df.dtypes}") + # Convert timestamp columns to datetime64[ns] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: @@ -191,6 +194,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") + write_deltalake( self.delta_table_path, df, From 2a7712ec6002b58e4d3e6028b9a08f09895d8e29 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:20:07 +0200 Subject: [PATCH 48/87] fix: Add DataFrame schema logging when timestamp-related error occurs during Delta Lake write --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 7991f84cb..ffb47db7d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -217,9 +217,13 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: + # When error is related to timestamp, print the schema of the DataFrame + if "timestamp" in str(e): + LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) + if attempt < max_retries - 1: LOG.info(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) From f611215ba6b8dfa8ef1ef8d09fddf3ebde6430b5 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:36:58 +0200 Subject: [PATCH 49/87] fix: convert timestamp columns to datetime64[ms] --- cryptofeed/backends/deltalake.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ffb47db7d..57084ec1a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -136,7 +136,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") @@ -178,10 +178,10 @@ async def _write_batch(self, df: pd.DataFrame): # Debug output the schema of the DataFrame LOG.debug(f"DataFrame 
schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ns] + # Convert timestamp columns to datetime64[ms] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[ms]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From a16f737c873c257cc0dac8e7b4ed25dfe348828d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:59:19 +0200 Subject: [PATCH 50/87] fix: Ensure all partition columns are present in the DataFrame --- cryptofeed/backends/deltalake.py | 36 +++++++++++++++----------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 57084ec1a..32e9a902a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -167,6 +153,18 @@ async def _write_batch(self, df: pd.DataFrame): LOG.warning("DataFrame is empty. Skipping write operation.") return + # Ensure all partition columns are present in the DataFrame + for col in self.partition_cols: + if col not in df.columns: + if col == "exchange" or col == "symbol": + df[col] = "" # Default to empty string for categorical columns + elif col == "dt": + df[col] = pd.Timestamp.min.strftime( + "%Y-%m-%d" + ) # Default to min date for date columns + else: + df[col] = 0 # Default to 0 for numeric columns + max_retries = 3 retry_delay = 5 # seconds @@ -218,8 +216,8 @@ async def _write_batch(self, df: pd.DataFrame): except Exception as e: # When error is related to timestamp, print the schema of the DataFrame - if "timestamp" in str(e): - LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) From 4173cc2dd03e8b63f01ea5ef1d7a03fb3edfb036 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:00:25 +0200 Subject: [PATCH 51/87] fix: convert timestamp column to datetime64[ms] format --- cryptofeed/backends/deltalake.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 32e9a902a..cb9f98328 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + 
BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -124,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = df["timestamp"].dt.date.astype("string") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") @@ -248,9 +262,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: df[col] = df[col].fillna( False ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ns]": + elif df[col].dtype == "datetime64[ms]": df[col] = df[col].fillna( - pd.Timestamp.min + pd.Timestamp.min.astype("datetime64[ms]") ) # Replace None with minimum timestamp for datetime columns return df From b880bfa7c3f6a5cec83bd0e9dfe8f9b921b18bdb Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:11:45 +0200 Subject: [PATCH 52/87] feat: Convert timestamp column to date string format in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index cb9f98328..d05ebb06d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -138,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.date.astype("string") + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") From b02e46ccb7aeb63056578fd5c8eb80f574476811 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:32:15 +0200 Subject: [PATCH 53/87] refactor: Simplify null value handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d05ebb06d..e38a755c6 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -248,25 +248,18 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: if self.none_to is not None: return df.fillna(self.none_to) else: - # Replace None with appropriate default values based on column type for col in df.columns: - if df[col].dtype == "object": - df[col] = df[col].fillna( - "" - ) # Replace None with empty string for object columns - elif df[col].dtype in ["float64", "int64"]: - df[col] = df[col].fillna( - 0 - ) # Replace None with 0 for numeric columns - elif df[col].dtype == "bool": - df[col] = df[col].fillna( - False - ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ms]": - df[col] = df[col].fillna( - pd.Timestamp.min.astype("datetime64[ms]") - ) # Replace None with minimum timestamp for datetime columns - return df + if pd.api.types.is_string_dtype( + df[col] + ) or pd.api.types.is_categorical_dtype(df[col]): + df[col] = df[col].fillna("") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna(0) + elif 
pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna(False) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") From 358c162478de9cd828797a90fcc640087adfd7b0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:42:41 +0200 Subject: [PATCH 54/87] fix: Ensure empty string is a category in categorical columns and handle null values correctly --- cryptofeed/backends/deltalake.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e38a755c6..c15c85c10 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -153,7 +153,11 @@ def _convert_category_fields(self, df: pd.DataFrame): ] for col in category_columns: if col in df.columns: - df[col] = df[col].astype("category") + # Add empty string as a category if it's not already present + categories = df[col].unique().tolist() + if '' not in categories: + categories.append('') + df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): LOG.debug("Converting integer fields.") @@ -249,9 +253,12 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_string_dtype( - df[col] - ) or pd.api.types.is_categorical_dtype(df[col]): + if pd.api.types.is_categorical_dtype(df[col]): + # Ensure '' is in the categories before filling + if '' not in df[col].cat.categories: + df[col] = df[col].cat.add_categories(['']) + df[col] = df[col].fillna('') + elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) From 90a1559bdc7cf3849e266f25c3d045156eba5207 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:54:28 +0200 Subject: [PATCH 55/87] refactor: Refactor DeltaLakeCallback class to improve code readability and maintainability --- cryptofeed/backends/deltalake.py | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c15c85c10..b9ac6c296 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -128,7 +114,7 @@ async def writer(self): def _convert_fields(self, df: pd.DataFrame): LOG.debug("Converting fields in DataFrame.") self._convert_datetime_fields(df) - self._convert_category_fields(df) + # self._convert_category_fields(df) self._convert_int_fields(df) def _convert_datetime_fields(self, df: pd.DataFrame): @@ -155,8 +141,8 @@ def 
_convert_category_fields(self, df: pd.DataFrame): if col in df.columns: # Add empty string as a category if it's not already present categories = df[col].unique().tolist() - if '' not in categories: - categories.append('') + if "" not in categories: + categories.append("") df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): @@ -255,9 +241,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: for col in df.columns: if pd.api.types.is_categorical_dtype(df[col]): # Ensure '' is in the categories before filling - if '' not in df[col].cat.categories: - df[col] = df[col].cat.add_categories(['']) - df[col] = df[col].fillna('') + if "" not in df[col].cat.categories: + df[col] = df[col].cat.add_categories([""]) + df[col] = df[col].fillna("") elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): From 1365432523c91dab6aa2b3bb3c58fd638929a55d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:00:09 +0200 Subject: [PATCH 56/87] fix: Improve error handling and logging in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 34 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index b9ac6c296..0ed17520b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -219,8 +233,9 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame + # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame:\n{df}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" @@ -239,19 +254,16 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_categorical_dtype(df[col]): - # Ensure '' is in the categories before filling - if "" not in df[col].cat.categories: - df[col] = df[col].cat.add_categories([""]) - df[col] = df[col].fillna("") - elif pd.api.types.is_string_dtype(df[col]): + if pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) elif pd.api.types.is_bool_dtype(df[col]): df[col] = df[col].fillna(False) elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + df[col] = df[col].fillna(pd.Timestamp.min) + else: + df[col] = df[col].fillna(None) return df async def _optimize_table(self): From ac5e61a553d56be1468446de0b1894c876ab87fb Mon Sep 17 00:00:00 2001 From: Tommy 
K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:07:09 +0200 Subject: [PATCH 57/87] fix: Optimize Delta Lake table by filling null values with empty strings --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ed17520b..ccc8232b5 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -263,7 +249,7 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna(None) + df[col] = df[col].fillna("") return df async def _optimize_table(self): From 3338d3cb19c16e7bb7c95bffe0362a04fd894036 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:44:08 +0200 Subject: [PATCH 58/87] fix: optimize handling of missing data in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ccc8232b5..16a25e2b1 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -249,7 +263,8 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna("") + # For any other data types, use an empty string as a fallback + df[col] = df[col].astype(object).fillna("") return df async def _optimize_table(self): From 7c93b11688095577ca7fcbc583abe5799a15b406 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 03:55:54 +0200 Subject: [PATCH 59/87] feat: Add custom transformations and improve column validation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 217 +++++++++++++++++++------------ 1 file changed, 131 insertions(+), 86 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 16a25e2b1..e135b87c0 100644 --- a/cryptofeed/backends/deltalake.py +++ 
b/cryptofeed/backends/deltalake.py @@ -49,6 +49,7 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): super().__init__() @@ -65,6 +66,16 @@ def __init__( self.running = True self.numeric_type = numeric_type self.none_to = none_to + self.transformations = [ + self._rename_custom_columns, + self._convert_datetime_columns, + self._convert_int_columns, + self._ensure_partition_columns, + self._handle_missing_values, + self._reorder_columns, + ] + if custom_transformations: + self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() @@ -112,76 +123,146 @@ async def writer(self): async with self.read_queue() as updates: if updates: LOG.info(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) - self._convert_fields(df) - # Reorder columns to put exchange and symbol first - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", "symbol"] - ] - df = df[cols] - if self.custom_columns: - df = df.rename(columns=self.custom_columns) + self._transform_columns(df) + self._validate_columns(df) await self._write_batch(df) - def _convert_fields(self, df: pd.DataFrame): - LOG.debug("Converting fields in DataFrame.") - self._convert_datetime_fields(df) - # self._convert_category_fields(df) - self._convert_int_fields(df) + def _validate_columns(self, df: pd.DataFrame): + LOG.debug("Validating DataFrame columns.") + # Check for required columns + required_columns = ["exchange", "symbol", "dt"] + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") + + # Validate partition columns + for col in self.partition_cols: + if col not in df.columns: + raise ValueError(f"Partition column '{col}' not found in DataFrame") + if df[col].isnull().any(): + raise ValueError(f"Partition column '{col}' contains null values") + + # Validate data types + expected_types = { + "exchange": "object", + "symbol": "object", + "dt": "object", + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", + } + for col, expected_type in expected_types.items(): + if col in df.columns and not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) + + LOG.debug("DataFrame columns validation completed successfully.") + + def _transform_columns(self, df: pd.DataFrame): + LOG.debug("Transforming columns in DataFrame.") + for transformation in self.transformations: + transformation(df) + + def _rename_custom_columns(self, df: pd.DataFrame): + if self.custom_columns: + LOG.debug("Renaming columns based on custom_columns configuration.") + df.rename(columns=self.custom_columns, inplace=True) + + def _reorder_columns(self, df: pd.DataFrame): + LOG.debug("Reordering columns to prioritize exchange and symbol.") + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] + df.reindex(columns=cols, inplace=True) - def _convert_datetime_fields(self, df: pd.DataFrame): - LOG.debug("Converting datetime fields.") + def _convert_datetime_columns(self, df: pd.DataFrame): + LOG.debug("Converting datetime columns.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = 
pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") + df[col] = pd.to_datetime(df[col], unit="ms") + + # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + elif "receipt_timestamp" in df.columns: + df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + else: + LOG.warning("No timestamp column found. Using current date for 'dt'.") + df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") - def _convert_category_fields(self, df: pd.DataFrame): - LOG.debug("Converting category fields.") - category_columns = [ - "exchange", - "symbol", - "side", - "type", - "status", - "currency", - "liquidity", - ] - for col in category_columns: - if col in df.columns: - # Add empty string as a category if it's not already present - categories = df[col].unique().tolist() - if "" not in categories: - categories.append("") - df[col] = pd.Categorical(df[col], categories=categories) - - def _convert_int_fields(self, df: pd.DataFrame): - LOG.debug("Converting integer fields.") + def _convert_int_columns(self, df: pd.DataFrame): + LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] for col in int_columns: if col in df.columns: - df[col] = df[col].astype("int64") + df[col] = pd.to_numeric(df[col], errors="coerce").astype( + "Int64" + ) # Use nullable integer type - async def _write_batch(self, df: pd.DataFrame): - if df.empty: - LOG.warning("DataFrame is empty. Skipping write operation.") - return - - # Ensure all partition columns are present in the DataFrame + def _ensure_partition_columns(self, df: pd.DataFrame): + LOG.debug("Ensuring all partition columns are present and not null.") for col in self.partition_cols: if col not in df.columns: - if col == "exchange" or col == "symbol": - df[col] = "" # Default to empty string for categorical columns + if col in ["exchange", "symbol"]: + df[col] = "unknown" elif col == "dt": - df[col] = pd.Timestamp.min.strftime( - "%Y-%m-%d" - ) # Default to min date for date columns + # 'dt' should already be created in _convert_datetime_columns + LOG.warning("'dt' column not found. This should not happen.") + df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") else: - df[col] = 0 # Default to 0 for numeric columns + df[col] = "unknown" + + # Fill any remaining null values + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." + ) + df[col] = df[col].fillna( + "unknown" + if col != "dt" + else pd.Timestamp.now().strftime("%Y-%m-%d") + ) + + def _handle_missing_values(self, df: pd.DataFrame): + LOG.debug("Handling missing values.") + for col in df.columns: + if col in ["exchange", "symbol"]: # Removed 'dt' from this list + # These are partition columns and should never be null + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." 
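                    # (editor's note, not part of the patch: these defaults exist because the
                    # _validate_columns check run on every batch raises ValueError when a
                    # partition column contains nulls, so every partition column must be
                    # filled before the batch reaches write_deltalake.)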
+ ) + df[col] = df[col].fillna("unknown") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else 0 + ) + elif pd.api.types.is_string_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + elif pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else False + ) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else pd.NaT + ) + else: + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") + return max_retries = 3 retry_delay = 5 # seconds @@ -191,27 +272,11 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - # Debug output the schema of the DataFrame LOG.debug(f"DataFrame schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ms] - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ms]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) - - # Handle null values - df = self._handle_null_values(df) - LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) - # Debug output the schema of the DataFrame - LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") write_deltalake( self.delta_table_path, @@ -233,10 +298,8 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error(f"DataFrame:\n{df}") - LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) @@ -249,24 +312,6 @@ async def _write_batch(self, df: pd.DataFrame): "Max retries reached. Failed to write batch to Delta Lake." 
) - def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: - if self.none_to is not None: - return df.fillna(self.none_to) - else: - for col in df.columns: - if pd.api.types.is_string_dtype(df[col]): - df[col] = df[col].fillna("") - elif pd.api.types.is_numeric_dtype(df[col]): - df[col] = df[col].fillna(0) - elif pd.api.types.is_bool_dtype(df[col]): - df[col] = df[col].fillna(False) - elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min) - else: - # For any other data types, use an empty string as a fallback - df[col] = df[col].astype(object).fillna("") - return df - async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From bdea1ff0716b15564dab43c445d681de1680a0c7 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:11:14 +0200 Subject: [PATCH 60/87] fix: Add logging configuration to deltalake backend --- cryptofeed/backends/deltalake.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e135b87c0..94628ad1e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -10,28 +10,16 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union -import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +# Add these lines after the imports +logging.basicConfig(level=logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") From 11c7b222c0e703795314f0419b787bc5bf3f5b80 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:19:36 +0200 Subject: [PATCH 61/87] fix: Initialize DeltaLakeCallback and add logging for writer method and _write_batch --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 94628ad1e..91c139d5b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -40,6 +40,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): + LOG.debug("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -107,8 +108,10 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): + LOG.debug("Writer method called") while self.running: async with self.read_queue() as updates: + LOG.debug(f"Read queue returned {len(updates)} updates") if updates: LOG.info(f"Received {len(updates)} updates for processing.") @@ -248,6 +251,7 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): + LOG.debug(f"_write_batch called with 
DataFrame of shape {df.shape}") if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return From d0ccd929d68daf95d1f307d56e1a1a7c0bc0e912 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:23:20 +0200 Subject: [PATCH 62/87] fix: Change logging levels from DEBUG to WARNING in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 43 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 91c139d5b..68250f6c8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -17,9 +17,10 @@ from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + # Add these lines after the imports -logging.basicConfig(level=logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) +# logging.basicConfig(level=logging.DEBUG) +# logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") @@ -40,7 +41,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.debug("Initializing DeltaLakeCallback") + LOG.warning("Initializing DeltaLakeCallback") # Changed to warning super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -108,12 +109,16 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.debug("Writer method called") + LOG.warning("Writer method called") # Changed to warning while self.running: async with self.read_queue() as updates: - LOG.debug(f"Read queue returned {len(updates)} updates") + LOG.warning( + f"Read queue returned {len(updates)} updates" + ) # Changed to warning if updates: - LOG.info(f"Received {len(updates)} updates for processing.") + LOG.warning( + f"Received {len(updates)} updates for processing." + ) # Changed to warning df = pd.DataFrame(updates) @@ -251,7 +256,9 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): - LOG.debug(f"_write_batch called with DataFrame of shape {df.shape}") + LOG.warning( + f"_write_batch called with DataFrame of shape {df.shape}" + ) # Changed to warning if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -261,14 +268,14 @@ async def _write_batch(self, df: pd.DataFrame): for attempt in range(max_retries): try: - LOG.info( + LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) - LOG.debug(f"DataFrame schema:\n{df.dtypes}") + ) # Changed to warning + LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning - LOG.info( + LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) + ) # Changed to warning write_deltalake( self.delta_table_path, @@ -286,7 +293,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.info("Batch write successful.") + LOG.warning("Batch write successful.") # Changed to warning break # Exit the retry loop if write is successful except Exception as e: @@ -297,7 +304,9 @@ async def _write_batch(self, df: pd.DataFrame): ) if attempt < max_retries - 1: - LOG.info(f"Retrying in {retry_delay} seconds...") + LOG.warning( + f"Retrying in {retry_delay} seconds..." 
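                        # (editor's note, not part of the patch: retry_delay is a fixed 5 s and
                        # max_retries is 3, so a failing batch is retried twice before the final
                        # error is logged and the batch is abandoned.)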
+ ) # Changed to warning await asyncio.sleep(retry_delay) else: LOG.error( @@ -305,12 +314,14 @@ async def _write_batch(self, df: pd.DataFrame): ) async def _optimize_table(self): - LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + LOG.warning( + f"Running OPTIMIZE on table {self.delta_table_path}" + ) # Changed to warning dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.info("OPTIMIZE operation completed.") + LOG.warning("OPTIMIZE operation completed.") # Changed to warning def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From a8d631017c85849b3820911ff82ae0e6420f649f Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:49:40 +0200 Subject: [PATCH 63/87] fix: Improve logging and error handling in DeltaLakeCallback writer method --- cryptofeed/backends/deltalake.py | 61 ++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 68250f6c8..81f6f34b8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) # Add these lines after the imports @@ -109,23 +123,32 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.warning("Writer method called") # Changed to warning + LOG.warning("Writer method started") while self.running: - async with self.read_queue() as updates: - LOG.warning( - f"Read queue returned {len(updates)} updates" - ) # Changed to warning - if updates: - LOG.warning( - f"Received {len(updates)} updates for processing." 
- ) # Changed to warning - - df = pd.DataFrame(updates) - - self._transform_columns(df) - self._validate_columns(df) - - await self._write_batch(df) + try: + async with self.read_queue() as updates: + LOG.warning(f"Read queue returned {len(updates)} updates") + if updates: + LOG.warning(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_fields(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + else: + LOG.warning("No updates received, continuing loop") + except Exception as e: + LOG.error(f"Error in writer method: {e}", exc_info=True) + LOG.warning("Writer method ended") def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") From 4946bb843e520e9d1d1c385f5ea3fbc6760ce956 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:59:06 +0200 Subject: [PATCH 64/87] fix: Refactor field transformation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 81f6f34b8..0ce9b9116 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,23 +13,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) # Add these lines after the imports @@ -134,7 +120,7 @@ async def writer(self): LOG.warning(f"Created DataFrame with shape: {df.shape}") LOG.warning("Starting field transformation") - self._transform_fields(df) + self._transform_columns(df) LOG.warning("Field transformation completed") LOG.warning("Validating columns") From b1cad2b37250a68b3a3b6887f6d011f577ecd24c Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:15:11 +0200 Subject: [PATCH 65/87] fix: Reorder columns to prioritize exchange and symbol --- cryptofeed/backends/deltalake.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ce9b9116..314bae1bf 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -62,7 +62,6 @@ def __init__( self._convert_int_columns, self._ensure_partition_columns, self._handle_missing_values, - self._reorder_columns, ] if custom_transformations: self.transformations.extend(custom_transformations) @@ -179,10 +178,9 @@ def _rename_custom_columns(self, df: pd.DataFrame): def _reorder_columns(self, df: pd.DataFrame): LOG.debug("Reordering columns to prioritize exchange and symbol.") - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", 
"symbol"] - ] - df.reindex(columns=cols, inplace=True) + priority_cols = ["exchange", "symbol"] + other_cols = [col for col in df.columns if col not in priority_cols] + df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns.") From 6cfd5f71f88e4ab7acfc76ccf15ea1b360af7afd Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:22:38 +0200 Subject: [PATCH 66/87] fix: Ensure datetime columns have millisecond precision in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 314bae1bf..ee7027884 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,18 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' } for col, expected_type in expected_types.items(): - if col in df.columns and not df[col].dtype == expected_type: - raise TypeError( - f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" - ) + if col in df.columns: + if expected_type.startswith("datetime64"): + # Convert to millisecond precision if it's a datetime column + df[col] = df[col].astype('datetime64[ms]') + if not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) LOG.debug("DataFrame columns validation completed successfully.") @@ -187,7 +191,8 @@ def _convert_datetime_columns(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ms") + # Convert to millisecond precision + df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: From d213cf00fe832e1c58577fd28de09dc0c515c05f Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:39:57 +0200 Subject: [PATCH 67/87] feat: Ensure datetime columns are in millisecond precision --- cryptofeed/backends/deltalake.py | 95 ++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ee7027884..f3b8eff50 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' - "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type.startswith("datetime64"): - # Convert to millisecond precision if it's a datetime column - df[col] = df[col].astype('datetime64[ms]') + if expected_type == "datetime64[ms]": + # Ensure datetime columns are in millisecond precision + df[col] = df[col].astype("datetime64[ms]") if not df[col].dtype == expected_type: raise 
TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,22 +187,47 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns.") + LOG.debug("Converting datetime columns to millisecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - # Convert to millisecond precision - df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') + # Log sample of original values + LOG.warning( + f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" + ) + # Convert to millisecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Log sample of converted values in readable format + if len(df) > 0: + readable_time = ( + df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + ) + LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' + min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["timestamp"] + .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) elif "receipt_timestamp" in df.columns: - df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["receipt_timestamp"] + .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) else: LOG.warning("No timestamp column found. Using current date for 'dt'.") df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + # Log sample of 'dt' column + if "dt" in df.columns and len(df) > 0: + LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + + LOG.debug("Datetime columns converted to millisecond precision.") + def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] @@ -363,8 +388,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -381,14 +406,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ns] (nullable) + - next_funding_time: datetime64[ms] (nullable) - predicted_rate: float64 (nullable) """ @@ -397,8 +422,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -411,8 +436,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: 
datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -424,8 +449,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -441,8 +466,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -461,13 +486,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - - start: datetime64[ns] - - stop: datetime64[ns] + - start: datetime64[ms] + - stop: datetime64[ms] - interval: string - trades: int64 (nullable) - open: float64 @@ -483,8 +508,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -504,8 +529,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -519,8 +544,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -533,8 +558,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category From adac33bc4f8bd4dd71811c907cf472405e90d455 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 00:42:34 +0200 Subject: [PATCH 68/87] chore: Convert datetime columns to microsecond precision --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 72 ++++++++++++++++---------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index ac64f2b9e..ed13eb5fa 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ ENV/ # PyCharm .idea/ .aider* +.trunk/ diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index f3b8eff50..af5d6b31e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def 
_validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[us]", + "receipt_timestamp": "datetime64[us]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type == "datetime64[ms]": - # Ensure datetime columns are in millisecond precision - df[col] = df[col].astype("datetime64[ms]") + if expected_type == "datetime64[us]": + # Ensure datetime columns are in microsecond precision + df[col] = df[col].astype("datetime64[us]") if not df[col].dtype == expected_type: raise TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,7 +187,7 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns to millisecond precision.") + LOG.debug("Converting datetime columns to microsecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: @@ -195,13 +195,11 @@ def _convert_datetime_columns(self, df: pd.DataFrame): LOG.warning( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) - # Convert to millisecond precision, handling both string and datetime inputs - df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Convert to microsecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[us]") # Log sample of converted values in readable format if len(df) > 0: - readable_time = ( - df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] - ) + readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' @@ -226,7 +224,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): if "dt" in df.columns and len(df) > 0: LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") - LOG.debug("Datetime columns converted to millisecond precision.") + LOG.debug("Datetime columns converted to microsecond precision.") def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") @@ -388,8 +386,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -406,14 +404,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ms] (nullable) + - next_funding_time: datetime64[us] (nullable) - predicted_rate: float64 (nullable) """ @@ -422,8 +420,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category 
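(Editor's note: the hunks above and below switch the documented schemas from millisecond to microsecond precision. The snippet below is an editorial illustration of the conversion these patches converge on, not part of any patch. It assumes pandas >= 2.0, which supports non-nanosecond datetime64 resolutions, and assumes the incoming `timestamp`/`receipt_timestamp` fields are float epoch seconds; the sample values are invented.)

```python
import pandas as pd

# Minimal sketch: coerce epoch-second floats to microsecond-precision datetimes
# and derive the 'dt' partition value, mirroring _convert_datetime_columns.
updates = [
    {"exchange": "BINANCE", "symbol": "BTC-USDT", "price": 64250.5,
     "timestamp": 1725321600.123456, "receipt_timestamp": 1725321600.234567},
]
df = pd.DataFrame(updates)

for col in ("timestamp", "receipt_timestamp"):
    # unit="s" is an assumption about the feed payload; at this point in the
    # patch series the backend lets pd.to_datetime infer the unit instead.
    df[col] = pd.to_datetime(df[col], unit="s").astype("datetime64[us]")

# Partition value: a YYYY-MM-DD string derived from the event timestamp.
df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d")

print(df.dtypes)  # timestamp / receipt_timestamp -> datetime64[us], dt -> object
```

The resulting frame is what `_write_batch` then hands to `write_deltalake` (presumably with `mode="append"` and `partition_by` set to the configured partition columns), so `dt` ends up as a Hive-style partition directory such as `dt=2024-09-03/`.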
@@ -436,8 +434,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -449,8 +447,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -466,8 +464,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -486,13 +484,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - - start: datetime64[ms] - - stop: datetime64[ms] + - start: datetime64[us] + - stop: datetime64[us] - interval: string - trades: int64 (nullable) - open: float64 @@ -508,8 +506,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -529,8 +527,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -544,8 +542,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -558,8 +556,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category From 4e2aff6dda10c0248f71a28e3ca5be88659cafc8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:04:50 +0200 Subject: [PATCH 69/87] fix: Change log levels from warning to debug for non-critical messages --- cryptofeed/backends/deltalake.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index af5d6b31e..412794626 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -41,7 +41,7 @@ 
def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.warning("Initializing DeltaLakeCallback") # Changed to warning + LOG.warning("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -192,7 +192,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in datetime_columns: if col in df.columns: # Log sample of original values - LOG.warning( + LOG.debug( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) # Convert to microsecond precision, handling both string and datetime inputs @@ -200,7 +200,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of converted values in readable format if len(df) > 0: readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") - LOG.warning(f"Sample {col} after conversion: {readable_time}") + LOG.debug(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed @@ -222,7 +222,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: - LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") LOG.debug("Datetime columns converted to microsecond precision.") @@ -293,7 +293,7 @@ def _handle_missing_values(self, df: pd.DataFrame): async def _write_batch(self, df: pd.DataFrame): LOG.warning( f"_write_batch called with DataFrame of shape {df.shape}" - ) # Changed to warning + ) if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -305,12 +305,12 @@ async def _write_batch(self, df: pd.DataFrame): try: LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) # Changed to warning - LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning + ) + LOG.debug(f"DataFrame schema:\n{df.dtypes}") LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) # Changed to warning + ) write_deltalake( self.delta_table_path, @@ -328,7 +328,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.warning("Batch write successful.") # Changed to warning + LOG.warning("Batch write successful.") break # Exit the retry loop if write is successful except Exception as e: @@ -341,7 +341,7 @@ async def _write_batch(self, df: pd.DataFrame): if attempt < max_retries - 1: LOG.warning( f"Retrying in {retry_delay} seconds..." 
- ) # Changed to warning + ) await asyncio.sleep(retry_delay) else: LOG.error( @@ -351,12 +351,12 @@ async def _write_batch(self, df: pd.DataFrame): async def _optimize_table(self): LOG.warning( f"Running OPTIMIZE on table {self.delta_table_path}" - ) # Changed to warning + ) dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.warning("OPTIMIZE operation completed.") # Changed to warning + LOG.warning("OPTIMIZE operation completed.") def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 99b3a366bc9f50d96d4d84d1eece4da146031536 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:25:25 +0200 Subject: [PATCH 70/87] refactor: Simplify datetime column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 40 +++++++++++++------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 412794626..d11c3c596 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -205,20 +205,12 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = ( - df["timestamp"] - .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["timestamp"].where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date elif "receipt_timestamp" in df.columns: - df["dt"] = ( - df["receipt_timestamp"] - .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["receipt_timestamp"].where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date else: LOG.warning("No timestamp column found. Using current date for 'dt'.") - df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + df["dt"] = pd.Timestamp.now().date() # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: @@ -244,7 +236,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): elif col == "dt": # 'dt' should already be created in _convert_datetime_columns LOG.warning("'dt' column not found. 
This should not happen.") - df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") + df[col] = pd.Timestamp.now().date() else: df[col] = "unknown" @@ -256,7 +248,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): df[col] = df[col].fillna( "unknown" if col != "dt" - else pd.Timestamp.now().strftime("%Y-%m-%d") + else pd.Timestamp.now().date() ) def _handle_missing_values(self, df: pd.DataFrame): @@ -388,7 +380,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 (nullable) @@ -406,7 +398,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - mark_price: float64 (nullable) @@ -422,7 +414,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - bid: float64 @@ -436,7 +428,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - open_interest: float64 @@ -449,7 +441,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - side: category @@ -466,7 +458,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) @@ -486,7 +478,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - start: datetime64[us] @@ -508,7 +500,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 @@ -529,7 +521,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - type: category @@ -544,7 +536,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - balance: float64 @@ -558,7 +550,7 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - price: float64 From 1b25acd9c27ebb9bc765fcdc744fba20f4bb15f8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:05:38 +0200 Subject: [PATCH 71/87] feat: Add batch processing and 
flush interval to DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 53 +++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d11c3c596..5bfa7bd24 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -7,6 +7,7 @@ import asyncio import logging +import time from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -38,6 +39,8 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + batch_size: int = 1000, + flush_interval: float = 60.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): @@ -67,6 +70,10 @@ def __init__( self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() + self.batch_size = batch_size + self.flush_interval = flush_interval + self.batch = [] + self.last_flush_time = time.time() def _validate_configuration(self): if self.optimize_interval <= 0: @@ -115,26 +122,41 @@ async def writer(self): LOG.warning(f"Read queue returned {len(updates)} updates") if updates: LOG.warning(f"Received {len(updates)} updates for processing.") - df = pd.DataFrame(updates) - LOG.warning(f"Created DataFrame with shape: {df.shape}") + self.batch.extend(updates) - LOG.warning("Starting field transformation") - self._transform_columns(df) - LOG.warning("Field transformation completed") - - LOG.warning("Validating columns") - self._validate_columns(df) - LOG.warning("Columns validation completed") - - LOG.warning("Starting batch write") - await self._write_batch(df) - LOG.warning("Batch write completed") + if len(self.batch) >= self.batch_size or (time.time() - self.last_flush_time) >= self.flush_interval: + await self._process_batch() else: - LOG.warning("No updates received, continuing loop") + # Check if we need to flush based on time + if (time.time() - self.last_flush_time) >= self.flush_interval and self.batch: + await self._process_batch() + else: + LOG.warning("No updates received, continuing loop") + await asyncio.sleep(1) # Add a small delay to prevent busy-waiting except Exception as e: LOG.error(f"Error in writer method: {e}", exc_info=True) LOG.warning("Writer method ended") + async def _process_batch(self): + LOG.warning(f"Processing batch of {len(self.batch)} updates") + df = pd.DataFrame(self.batch) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_columns(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + + self.batch = [] + self.last_flush_time = time.time() + def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") # Check for required columns @@ -357,6 +379,9 @@ def _update_metadata(self): async def stop(self): LOG.info("Stopping DeltaLakeCallback writer.") self.running = False + # Flush any remaining data + if self.batch: + await self._process_batch() def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: From d649fea58743fd152b144c0ef5fb1e138925c5ec Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:25:43 +0200 
Subject: [PATCH 72/87] fix: Update copyright year in demo_deltalake.py --- examples/demo_deltalake.py | 4 ++-- setup.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index 5c973016e..95d2a405d 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -1,9 +1,9 @@ -''' +""" Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' +""" from cryptofeed import FeedHandler from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake from cryptofeed.defines import FUNDING, TICKER, TRADES diff --git a/setup.py b/setup.py index 344573348..fe38f7b7d 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,9 @@ import os import sys -from setuptools import Extension, setup -from setuptools import find_packages -from setuptools.command.test import test as TestCommand from Cython.Build import cythonize +from setuptools import Extension, find_packages, setup +from setuptools.command.test import test as TestCommand def get_long_description(): From bb4cb4cbe18431e95a020b494c72bf6becd3c22a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 03:19:11 +0200 Subject: [PATCH 73/87] feat: Add common configuration for Delta Lake callbacks --- examples/demo_deltalake.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index 95d2a405d..d258cb21e 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -23,26 +23,33 @@ def main(): "AWS_REGION": "your_region" } + # Common configuration for all callbacks + common_config = { + "base_path": delta_base_path, + "storage_options": s3_options, + "batch_size": 1000, # Process in batches of 1000 records + "flush_interval": 60.0, # Flush every 60 seconds if batch size not reached + "optimize_interval": 100000, # Optimize after 100,000 rows written + "time_travel": True, + } + # Add Binance feed with Delta Lake callbacks f.add_feed(Binance( channels=[TRADES, FUNDING, TICKER], symbols=['BTC-USDT', 'ETH-USDT'], callbacks={ TRADES: TradeDeltaLake( - base_path=delta_base_path, - optimize_interval=50, # More frequent table optimization - time_travel=True, # Enable time travel feature - storage_options=s3_options # Add S3 configuration + **common_config, + z_order_cols=['timestamp', 'price', 'amount'] ), FUNDING: FundingDeltaLake( - base_path=delta_base_path, - storage_options=s3_options # Add S3 configuration + **common_config, + z_order_cols=['timestamp', 'rate'] ), TICKER: TickerDeltaLake( - base_path=delta_base_path, - partition_cols=['exchange', 'symbol', 'year', 'month', 'day'], # Custom partitioning - z_order_cols=['timestamp', 'bid', 'ask'], # Enable Z-ordering - storage_options=s3_options # Add S3 configuration + **common_config, + partition_cols=['exchange', 'symbol', 'dt'], + z_order_cols=['timestamp', 'bid', 'ask'] ) } )) From 87a77f52e15ec32f75afcf0ad62aeb8177d6846d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:42:39 +0200 Subject: [PATCH 74/87] fix: Fix logging message in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 5bfa7bd24..97ad9d80a 100644 --- 
a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -119,7 +119,7 @@ async def writer(self): while self.running: try: async with self.read_queue() as updates: - LOG.warning(f"Read queue returned {len(updates)} updates") + LOG.warning(f"Read queue returned: {updates}") if updates: LOG.warning(f"Received {len(updates)} updates for processing.") self.batch.extend(updates) From 98762fdaf32309abca96926c5f2619221b94b378 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:56:33 +0200 Subject: [PATCH 75/87] refactor: Improve logging and error handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 97ad9d80a..c1980eb7d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -320,11 +320,14 @@ async def _write_batch(self, df: pd.DataFrame): LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - LOG.debug(f"DataFrame schema:\n{df.dtypes}") - LOG.warning( - f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) + # Moved logging statements here, just before write_deltalake + sample_size = min(5, len(df)) # Show up to 5 rows + LOG.warning(f"Sample of DataFrame to be written (first {sample_size} rows):") + LOG.warning(df.head(sample_size).to_string()) + LOG.warning("DataFrame dtypes:") + LOG.warning(df.dtypes.to_string()) + LOG.warning(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( self.delta_table_path, @@ -346,16 +349,12 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: + LOG.error(f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}") LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error(f"DataFrame:\n{df}") - LOG.error( - f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" - ) if attempt < max_retries - 1: - LOG.warning( - f"Retrying in {retry_delay} seconds..." 
- ) + LOG.warning(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) else: LOG.error( From 53a87c793cda6970bff6ffc24592fc9c5f19661c Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:14:37 +0200 Subject: [PATCH 76/87] refactor: Optimize DeltaLakeCallback batch processing --- cryptofeed/backends/deltalake.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c1980eb7d..405f4d264 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -39,8 +39,8 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, - batch_size: int = 1000, - flush_interval: float = 60.0, + batch_size: int = 100, + flush_interval: float = 10.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): @@ -138,21 +138,10 @@ async def writer(self): LOG.warning("Writer method ended") async def _process_batch(self): - LOG.warning(f"Processing batch of {len(self.batch)} updates") df = pd.DataFrame(self.batch) - LOG.warning(f"Created DataFrame with shape: {df.shape}") - - LOG.warning("Starting field transformation") self._transform_columns(df) - LOG.warning("Field transformation completed") - - LOG.warning("Validating columns") self._validate_columns(df) - LOG.warning("Columns validation completed") - - LOG.warning("Starting batch write") await self._write_batch(df) - LOG.warning("Batch write completed") self.batch = [] self.last_flush_time = time.time() @@ -305,9 +294,6 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): - LOG.warning( - f"_write_batch called with DataFrame of shape {df.shape}" - ) if df.empty: LOG.warning("DataFrame is empty. 
Skipping write operation.") return @@ -586,4 +572,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: category - type: category - account: string (nullable) - """ + """ \ No newline at end of file From acfd7ccac9e2ec2c9f09355896ba14689da76588 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:26:08 +0200 Subject: [PATCH 77/87] fix: Convert datetime columns to UTC and microsecond precision --- cryptofeed/backends/deltalake.py | 38 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 405f4d264..d3c1b41da 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -11,6 +11,7 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -198,37 +199,30 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns to microsecond precision.") - datetime_columns = ["timestamp", "receipt_timestamp"] - for col in datetime_columns: + LOG.debug("Converting datetime columns to UTC and microsecond precision.") + INVALID_DATE = np.Timestamp('1900-01-01').date() + + for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: - # Log sample of original values - LOG.debug( - f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" - ) - # Convert to microsecond precision, handling both string and datetime inputs - df[col] = pd.to_datetime(df[col]).astype("datetime64[us]") - # Log sample of converted values in readable format - if len(df) > 0: - readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") - LOG.debug(f"Sample {col} after conversion: {readable_time}") - - # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' - min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed + # Convert timestamp (seconds since epoch) to UTC datetime + df[col] = pd.to_datetime(df[col], unit='s', utc=True) + df[col] = df[col].dt.tz_localize(None) # Remove timezone info after conversion + LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") + + # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE if "timestamp" in df.columns: - df["dt"] = df["timestamp"].where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date + df["dt"] = df["timestamp"].dt.date elif "receipt_timestamp" in df.columns: - df["dt"] = df["receipt_timestamp"].where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date + df["dt"] = df["receipt_timestamp"].dt.date else: - LOG.warning("No timestamp column found. Using current date for 'dt'.") - df["dt"] = pd.Timestamp.now().date() + LOG.warning("Neither timestamp nor receipt_timestamp column found. 
Using invalid date for 'dt'.") + df["dt"] = INVALID_DATE # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") - LOG.debug("Datetime columns converted to microsecond precision.") - + LOG.debug("Datetime columns converted and 'dt' column created.") def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] From b600513ae691d6c2bbed1f32e5a51d64217e61e5 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:33:51 +0200 Subject: [PATCH 78/87] feat: Convert datetime columns to UTC and microsecond precision --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d3c1b41da..936b8c58f 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -200,7 +200,7 @@ def _reorder_columns(self, df: pd.DataFrame): def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns to UTC and microsecond precision.") - INVALID_DATE = np.Timestamp('1900-01-01').date() + INVALID_DATE = np.datetime64('1900-01-01').date() for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: From fae08d4b5ababb3ccc505a35f7115b9aeeb76b59 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:37:29 +0200 Subject: [PATCH 79/87] feat: Convert datetime columns to UTC and microsecond precision --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 936b8c58f..504b7936e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -200,7 +200,7 @@ def _reorder_columns(self, df: pd.DataFrame): def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns to UTC and microsecond precision.") - INVALID_DATE = np.datetime64('1900-01-01').date() + INVALID_DATE = pd.Timestamp('1900-01-01').date() for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: From b313e5e9a6291149f4959347c26d2a7d95c81f95 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:11:03 +0200 Subject: [PATCH 80/87] feat: Increase optimize_interval, enable time_travel, change numeric_type and batch_size in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 504b7936e..b2fbe8b95 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -34,13 +34,13 @@ def __init__( key: Optional[str] = None, custom_columns: Optional[Dict[str, str]] = None, partition_cols: Optional[List[str]] = None, - optimize_interval: int = 100, + optimize_interval: int = 1000, z_order_cols: Optional[List[str]] = None, - time_travel: bool = False, + time_travel: bool = True, storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = float, + numeric_type: Union[type, str] = "float64", none_to: Any = None, - batch_size: int = 100, + batch_size: int = 10000, flush_interval: float = 10.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, @@ -54,7 +54,7 @@ def __init__( self.partition_cols = partition_cols 
or ["exchange", "symbol", "dt"] self.optimize_interval = optimize_interval self.z_order_cols = z_order_cols or self._default_z_order_cols() - self.time_travel = time_travel + self.time_travel = time_travel or False self.storage_options = storage_options or {} self.write_count = 0 self.running = True From 2261007c61b6e241984797f2ff3d75aeaaeb6691 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:42:31 +0200 Subject: [PATCH 81/87] fix: Ensure datetime columns are in microsecond precision and remove timezone info after conversion --- cryptofeed/backends/deltalake.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index b2fbe8b95..bfe2a7e7a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -172,9 +172,6 @@ def _validate_columns(self, df: pd.DataFrame): } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type == "datetime64[us]": - # Ensure datetime columns are in microsecond precision - df[col] = df[col].astype("datetime64[us]") if not df[col].dtype == expected_type: raise TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -205,8 +202,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: # Convert timestamp (seconds since epoch) to UTC datetime - df[col] = pd.to_datetime(df[col], unit='s', utc=True) - df[col] = df[col].dt.tz_localize(None) # Remove timezone info after conversion + df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None) LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE From 1da540f52c4740a993779fbca33d7e0ca8b2d59a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:47:52 +0200 Subject: [PATCH 82/87] feat: Change numeric_type to float in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index bfe2a7e7a..3b4a6a46d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -38,7 +38,7 @@ def __init__( z_order_cols: Optional[List[str]] = None, time_travel: bool = True, storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = "float64", + numeric_type: Union[type, str] = float, none_to: Any = None, batch_size: int = 10000, flush_interval: float = 10.0, From 3d1a84cc7ad6e32379b258c9792fcd25ac0f59a4 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 21:01:56 +0200 Subject: [PATCH 83/87] fix: Ensure datetime columns are in microsecond precision and convert timestamp columns to UTC datetime --- cryptofeed/backends/deltalake.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 3b4a6a46d..2a9364fc0 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -171,6 +171,9 @@ def _validate_columns(self, df: pd.DataFrame): "receipt_timestamp": "datetime64[us]", } for col, expected_type in expected_types.items(): + if expected_type == "datetime64[us]": + # Ensure datetime columns are 
in microsecond precision + df[col] = df[col].astype("datetime64[us]") if col in df.columns: if not df[col].dtype == expected_type: raise TypeError( @@ -202,7 +205,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: # Convert timestamp (seconds since epoch) to UTC datetime - df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None) + df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None).astype("datetime64[us, UTC]") LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE From fabc822f9a7c143331d58b4173de873068e6892b Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 21:10:31 +0200 Subject: [PATCH 84/87] fix: Convert timestamp columns to datetime64[ns] instead of datetime64[us, UTC] --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 2a9364fc0..1ee3bd623 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -205,7 +205,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: # Convert timestamp (seconds since epoch) to UTC datetime - df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None).astype("datetime64[us, UTC]") + df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None) LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE From 47ca6a4be0c068d092002ce12232758da6c02de8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 23:02:01 +0200 Subject: [PATCH 85/87] fix: Update logging levels in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 1ee3bd623..0022e9b98 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -11,7 +11,6 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union -import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -45,7 +44,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.warning("Initializing DeltaLakeCallback") + LOG.debug("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -116,13 +115,13 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.warning("Writer method started") + LOG.debug("Writer method started") while self.running: try: async with self.read_queue() as updates: - LOG.warning(f"Read queue returned: {updates}") + LOG.debug(f"Read queue returned: {updates}") if updates: - LOG.warning(f"Received {len(updates)} updates for processing.") + LOG.debug(f"Received {len(updates)} updates for processing.") self.batch.extend(updates) if len(self.batch) >= self.batch_size or (time.time() - self.last_flush_time) >= self.flush_interval: 
@@ -132,11 +131,11 @@ async def writer(self): if (time.time() - self.last_flush_time) >= self.flush_interval and self.batch: await self._process_batch() else: - LOG.warning("No updates received, continuing loop") + LOG.debug("No updates received, continuing loop") await asyncio.sleep(1) # Add a small delay to prevent busy-waiting except Exception as e: LOG.error(f"Error in writer method: {e}", exc_info=True) - LOG.warning("Writer method ended") + LOG.debug("Writer method ended") async def _process_batch(self): df = pd.DataFrame(self.batch) @@ -296,16 +295,16 @@ async def _write_batch(self, df: pd.DataFrame): for attempt in range(max_retries): try: - LOG.warning( + LOG.debug( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - # Moved logging statements here, just before write_deltalake + # Logging statements just before write_deltalake sample_size = min(5, len(df)) # Show up to 5 rows - LOG.warning(f"Sample of DataFrame to be written (first {sample_size} rows):") - LOG.warning(df.head(sample_size).to_string()) - LOG.warning("DataFrame dtypes:") - LOG.warning(df.dtypes.to_string()) + LOG.debug(f"Sample of DataFrame to be written (first {sample_size} rows):") + LOG.debug(df.head(sample_size).to_string()) + LOG.debug("DataFrame dtypes:") + LOG.debug(df.dtypes.to_string()) LOG.warning(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( @@ -341,14 +340,14 @@ async def _write_batch(self, df: pd.DataFrame): ) async def _optimize_table(self): - LOG.warning( + LOG.debug( f"Running OPTIMIZE on table {self.delta_table_path}" ) dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.warning("OPTIMIZE operation completed.") + LOG.debug("OPTIMIZE operation completed.") def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From cf0004a24d887a86c27364f01145858053033eb0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 10 Sep 2024 00:51:33 +0200 Subject: [PATCH 86/87] fix: Remove unnecessary logging statements in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0022e9b98..fea97730f 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -300,9 +300,9 @@ async def _write_batch(self, df: pd.DataFrame): ) # Logging statements just before write_deltalake - sample_size = min(5, len(df)) # Show up to 5 rows - LOG.debug(f"Sample of DataFrame to be written (first {sample_size} rows):") - LOG.debug(df.head(sample_size).to_string()) + # sample_size = min(5, len(df)) # Show up to 5 rows + # LOG.debug(f"Sample of DataFrame to be written (first {sample_size} rows):") + # LOG.debug(df.head(sample_size).to_string()) LOG.debug("DataFrame dtypes:") LOG.debug(df.dtypes.to_string()) LOG.warning(f"Writing batch of {len(df)} records to {self.delta_table_path}") @@ -564,4 +564,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: category - type: category - account: string (nullable) - """ \ No newline at end of file + """ From c4d7b239b9012e864d6ce4f30c63a785197ec99a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 10 Sep 2024 01:16:17 +0200 Subject: [PATCH 87/87] fix: Convert integer columns in 
DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 1 + examples/demo_deltalake.py | 6 +++--- setup.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fea97730f..7ebe520ae 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -221,6 +221,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") LOG.debug("Datetime columns converted and 'dt' column created.") + def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index d258cb21e..8d3c46cbd 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -12,7 +12,7 @@ def main(): f = FeedHandler() - + # Define the Delta Lake base path (can be local or S3) delta_base_path = 's3://your-bucket/path/to/delta/tables' @@ -53,9 +53,9 @@ def main(): ) } )) - + f.run() if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/setup.py b/setup.py index fe38f7b7d..0542ef42c 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,9 @@ -''' +""" Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' +""" import os import sys
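
For reference, the tables written by this backend are ordinary Delta tables partitioned by exchange, symbol, and dt, so they can be read back without cryptofeed. The following is a minimal, illustrative read-side sketch, not part of the patches above: it assumes the `deltalake` package's `DeltaTable` constructor accepts `version` and `storage_options` and that `to_pandas()` accepts partition filters; the bucket path, region, exchange, symbol, date, and version number are placeholders.

    # Illustrative read-side sketch (assumed deltalake API; placeholder paths and values).
    from deltalake import DeltaTable

    TABLE_URI = "s3://your-bucket/path/to/delta/tables/trades"  # hypothetical table path
    storage_options = {"AWS_REGION": "your_region"}  # plus AWS credentials, as in the demo above

    # Latest snapshot, pruned by the partition columns (exchange / symbol / dt).
    table = DeltaTable(TABLE_URI, storage_options=storage_options)
    df = table.to_pandas(
        partitions=[
            ("exchange", "=", "BINANCE"),
            ("symbol", "=", "BTC-USDT"),
            ("dt", "=", "2024-09-09"),
        ],
        columns=["timestamp", "receipt_timestamp", "price", "amount", "side"],
    )
    print(df.head())

    # Time travel: load an earlier version of the same table (version number is a placeholder).
    historical = DeltaTable(TABLE_URI, version=3, storage_options=storage_options)
    print(historical.to_pandas().shape)

Because dt is a plain date partition and exchange/symbol are low-cardinality strings, partition filters like the ones above avoid a full table scan; Z-ordering on timestamp/price further narrows the files touched by range queries within a partition.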