From 07cca1759f70391579f597111e482999d81f7c62 Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sat, 31 Aug 2024 23:37:59 +0200 Subject: [PATCH 01/87] feat(deltalake): Implement Delta Lake backend and add dependencies - Add DeltaLakeCallback class with support for various data types - Implement partitioning, Z-ordering, and time travel features - Add schema documentation for each data type - Include Delta Lake dependencies in setup.py - Create demo file for Delta Lake usage with S3 configuration - Update extras_require in setup.py to include deltalake option --- cryptofeed/backends/deltalake.py | 328 +++++++++++++++++++++++++++++++ examples/demo_deltalake.py | 54 +++++ setup.py | 2 + 3 files changed, 384 insertions(+) create mode 100644 cryptofeed/backends/deltalake.py create mode 100644 examples/demo_deltalake.py diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py new file mode 100644 index 000000000..1fd08e555 --- /dev/null +++ b/cryptofeed/backends/deltalake.py @@ -0,0 +1,328 @@ +''' +Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. +''' +from typing import Optional, List, Dict, Any +import logging +import pandas as pd +from deltalake import DeltaTable, write_deltalake + +from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK + +LOG = logging.getLogger('feedhandler') + +class DeltaLakeCallback(BackendQueue): + def __init__(self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs: Any): + super().__init__() + self.key = key or self.default_key + self.base_path = base_path + self.delta_table_path = f"{self.base_path}/{self.key}" + self.custom_columns = custom_columns or {} + self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.optimize_interval = optimize_interval + self.z_order_cols = z_order_cols or self._default_z_order_cols() + self.time_travel = time_travel + self.storage_options = storage_options or {} + self.write_count = 0 + self.running = True + + if optimize_interval <= 0: + raise ValueError("optimize_interval must be a positive integer") + + if not isinstance(self.partition_cols, list): + raise TypeError("partition_cols must be a list of strings") + + if not isinstance(self.z_order_cols, list): + raise TypeError("z_order_cols must be a list of strings") + + def _default_z_order_cols(self) -> List[str]: + common_cols = ['exchange', 'symbol', 'timestamp'] + data_specific_cols = { + TRADES: ['price', 'amount'], + FUNDING: ['rate'], + TICKER: ['bid', 'ask'], + OPEN_INTEREST: ['open_interest'], + LIQUIDATIONS: ['quantity', 'price'], + BOOK: [], # Book data is typically queried by timestamp and symbol + CANDLES: ['open', 'close', 'high', 'low'], + ORDER_INFO: ['status', 'price', 'amount'], + TRANSACTIONS: ['type', 'amount'], + BALANCES: ['balance'], + FILLS: ['price', 'amount'] + } + return common_cols + data_specific_cols.get(self.key, []) + + async def writer(self): + while self.running: + async with self.read_queue() as updates: + if updates: + df = pd.DataFrame(updates) + df['date'] = 
pd.to_datetime(df['timestamp'], unit='s') + df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') + df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day + + # Reorder columns to put exchange and symbol first + cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + df = df[cols] + + if self.custom_columns: + df = df.rename(columns=self.custom_columns) + + await self._write_batch(df) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + return + + try: + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + except Exception as e: + LOG.error(f"Error writing to Delta Lake: {e}") + + async def _optimize_table(self): + LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + dt.optimize.compact() + if self.z_order_cols: + dt.optimize.z_order(self.z_order_cols) + + def _update_metadata(self): + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") + + async def stop(self): + self.running = False + + def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: + if self.time_travel: + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + if timestamp: + return dt.version_at_timestamp(timestamp) + else: + return dt.version() + else: + LOG.warning("Time travel is not enabled for this table") + return None + +class TradeDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRADES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string (nullable) + - side: string + - amount: float64 + - price: float64 + - type: string (nullable) + """ + +class FundingDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FUNDING + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - mark_price: float64 (nullable) + - rate: float64 + - next_funding_time: datetime64[ns] (nullable) + - predicted_rate: float64 (nullable) + """ + +class TickerDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TICKER + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - bid: float64 + - ask: float64 + """ + +class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = OPEN_INTEREST + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - open_interest: float64 + """ + +class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = LIQUIDATIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - 
receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - side: string + - quantity: float64 + - price: float64 + - id: string + - status: string + """ + +class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): + default_key = BOOK + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - delta: dict (nullable, contains 'bid' and 'ask' updates) + - book: dict (contains full order book snapshot when available) + """ + +class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = CANDLES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - start: datetime64[ns] + - stop: datetime64[ns] + - interval: string + - trades: int64 (nullable) + - open: float64 + - close: float64 + - high: float64 + - low: float64 + - volume: float64 + - closed: bool (nullable) + """ + +class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = ORDER_INFO + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string + - client_order_id: string (nullable) + - side: string + - status: string + - type: string + - price: float64 + - amount: float64 + - remaining: float64 (nullable) + - account: string (nullable) + """ + +class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRANSACTIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - type: string + - status: string + - amount: float64 + """ + +class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = BALANCES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - balance: float64 + - reserved: float64 (nullable) + """ + +class FillsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FILLS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - price: float64 + - amount: float64 + - side: string + - fee: float64 (nullable) + - id: string + - order_id: string + - liquidity: string + - type: string + - account: string (nullable) + """ \ No newline at end of file diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py new file mode 100644 index 000000000..5c973016e --- /dev/null +++ b/examples/demo_deltalake.py @@ -0,0 +1,54 @@ +''' +Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. 
+''' +from cryptofeed import FeedHandler +from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake +from cryptofeed.defines import FUNDING, TICKER, TRADES +from cryptofeed.exchanges import Binance + + +def main(): + f = FeedHandler() + + # Define the Delta Lake base path (can be local or S3) + delta_base_path = 's3://your-bucket/path/to/delta/tables' + + # S3 storage options (remove if using local storage) + s3_options = { + "AWS_ACCESS_KEY_ID": "your_access_key", + "AWS_SECRET_ACCESS_KEY": "your_secret_key", + "AWS_REGION": "your_region" + } + + # Add Binance feed with Delta Lake callbacks + f.add_feed(Binance( + channels=[TRADES, FUNDING, TICKER], + symbols=['BTC-USDT', 'ETH-USDT'], + callbacks={ + TRADES: TradeDeltaLake( + base_path=delta_base_path, + optimize_interval=50, # More frequent table optimization + time_travel=True, # Enable time travel feature + storage_options=s3_options # Add S3 configuration + ), + FUNDING: FundingDeltaLake( + base_path=delta_base_path, + storage_options=s3_options # Add S3 configuration + ), + TICKER: TickerDeltaLake( + base_path=delta_base_path, + partition_cols=['exchange', 'symbol', 'year', 'month', 'day'], # Custom partitioning + z_order_cols=['timestamp', 'bid', 'ask'], # Enable Z-ordering + storage_options=s3_options # Add S3 configuration + ) + } + )) + + f.run() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/setup.py b/setup.py index adf10f870..344573348 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def run_tests(self): "rabbit": ["aio_pika", "pika"], "redis": ["hiredis", "redis>=4.5.1"], "zmq": ["pyzmq"], + "deltalake": ["deltalake>=0.6.1", "pandas"], "all": [ "arctic", "google_cloud_pubsub>=2.4.1", @@ -107,6 +108,7 @@ def run_tests(self): "hiredis", "redis>=4.5.1", "pyzmq", + "deltalake>=0.6.1", ], }, ) From c003c69b76a44fc14ea63b3036279af096bcbb60 Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sun, 1 Sep 2024 00:11:55 +0200 Subject: [PATCH 02/87] feat(deltalake): optimize Delta Lake implementation --- cryptofeed/backends/deltalake.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 1fd08e555..bd29037ae 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,7 +5,9 @@ associated with this software. 
''' from typing import Optional, List, Dict, Any +from collections import defaultdict import logging + import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -228,6 +230,11 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): + self.snapshots_only = snapshots_only + self.snapshot_interval = snapshot_interval + self.snapshot_count = defaultdict(int) + super().__init__(*args, **kwargs) class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES From f8256dc2b64544119cc207fb043512a6085c4ca2 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 00:51:23 +0200 Subject: [PATCH 03/87] fix(deltalake): fix book table name --- cryptofeed/backends/deltalake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index bd29037ae..c55475f2c 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -12,7 +12,7 @@ from deltalake import DeltaTable, write_deltalake from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS LOG = logging.getLogger('feedhandler') @@ -217,7 +217,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): """ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - default_key = BOOK + default_key = "book" """ Schema: - timestamp: datetime64[ns] (from 'date' column) From cf2c92628725a227d19f993d57ca0c6afcd6bdb9 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:01:48 +0200 Subject: [PATCH 04/87] fix(deltalake): Fix book name --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c55475f2c..c1b882c34 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -57,7 +57,7 @@ def _default_z_order_cols(self) -> List[str]: TICKER: ['bid', 'ask'], OPEN_INTEREST: ['open_interest'], LIQUIDATIONS: ['quantity', 'price'], - BOOK: [], # Book data is typically queried by timestamp and symbol + "book": [], # Book data is typically queried by timestamp and symbol CANDLES: ['open', 'close', 'high', 'low'], ORDER_INFO: ['status', 'price', 'amount'], TRANSACTIONS: ['type', 'amount'], From b02ab52443cd04b4790f1f1716a4575ac0cb8138 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:21:33 +0200 Subject: [PATCH 05/87] fix(deltalake): Fix numeric type --- cryptofeed/backends/deltalake.py | 35 ++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c1b882c34..fda44563e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -4,9 +4,10 @@ Please see the LICENSE file for the terms and conditions associated with this software. 
''' -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Union from collections import defaultdict import logging +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -16,16 +17,19 @@ LOG = logging.getLogger('feedhandler') + class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, + def __init__(self, + base_path: str, + key: Optional[str] = None, custom_columns: Optional[Dict[str, str]] = None, partition_cols: Optional[List[str]] = None, optimize_interval: int = 100, z_order_cols: Optional[List[str]] = None, time_travel: bool = False, storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, **kwargs: Any): super().__init__() self.key = key or self.default_key @@ -49,6 +53,9 @@ def __init__(self, if not isinstance(self.z_order_cols, list): raise TypeError("z_order_cols must be a list of strings") + self.numeric_type = numeric_type + self.none_to = none_to + def _default_z_order_cols(self) -> List[str]: common_cols = ['exchange', 'symbol', 'timestamp'] data_specific_cols = { @@ -89,6 +96,15 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Replace None values with the specified value + if self.none_to is not None: + df = df.fillna(self.none_to) + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( self.delta_table_path, @@ -134,6 +150,7 @@ def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: LOG.warning("Time travel is not enabled for this table") return None + class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ @@ -152,6 +169,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): - type: string (nullable) """ + class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ @@ -169,6 +187,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): - predicted_rate: float64 (nullable) """ + class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ @@ -184,6 +203,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): - ask: float64 """ + class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ @@ -198,6 +218,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): - open_interest: float64 """ + class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ @@ -216,6 +237,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): - status: string """ + class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ @@ -236,6 +258,7 @@ def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs self.snapshot_count = defaultdict(int) super().__init__(*args, **kwargs) + class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ @@ -259,6 +282,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): - closed: bool (nullable) """ + class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ @@ -281,6 +305,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): - account: string (nullable) """ + class 
TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ @@ -297,6 +322,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): - amount: float64 """ + class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ @@ -312,6 +338,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): - reserved: float64 (nullable) """ + class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ From 42eea5544115e0b1bc3132f67cf9d31ccc3a09ea Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:26:24 +0200 Subject: [PATCH 06/87] fix: Ensure timestamp columns have nanosecond precision in DeltaLake backend --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 109 +++++++++++++++++++------------ 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 5860625f7..ac64f2b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ ENV/ # PyCharm .idea/ +.aider* diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fda44563e..963488b29 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -1,49 +1,60 @@ -''' +""" Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' -from typing import Optional, List, Dict, Any, Union -from collections import defaultdict +""" + import logging -import numpy as np +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + -LOG = logging.getLogger('feedhandler') +LOG = logging.getLogger("feedhandler") class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, - custom_columns: Optional[Dict[str, str]] = None, - partition_cols: Optional[List[str]] = None, - optimize_interval: int = 100, - z_order_cols: Optional[List[str]] = None, - time_travel: bool = False, - storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = float, - none_to: Any = None, - **kwargs: Any): + def __init__( + self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, + **kwargs: Any, + ): super().__init__() self.key = key or self.default_key self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.partition_cols = partition_cols or [ + "exchange", + "symbol", + "year", + "month", + "day", + ] self.optimize_interval = optimize_interval 
self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel self.storage_options = storage_options or {} self.write_count = 0 self.running = True - + if optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") @@ -57,19 +68,19 @@ def __init__(self, self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ['exchange', 'symbol', 'timestamp'] + common_cols = ["exchange", "symbol", "timestamp"] data_specific_cols = { - TRADES: ['price', 'amount'], - FUNDING: ['rate'], - TICKER: ['bid', 'ask'], - OPEN_INTEREST: ['open_interest'], - LIQUIDATIONS: ['quantity', 'price'], + TRADES: ["price", "amount"], + FUNDING: ["rate"], + TICKER: ["bid", "ask"], + OPEN_INTEREST: ["open_interest"], + LIQUIDATIONS: ["quantity", "price"], "book": [], # Book data is typically queried by timestamp and symbol - CANDLES: ['open', 'close', 'high', 'low'], - ORDER_INFO: ['status', 'price', 'amount'], - TRANSACTIONS: ['type', 'amount'], - BALANCES: ['balance'], - FILLS: ['price', 'amount'] + CANDLES: ["open", "close", "high", "low"], + ORDER_INFO: ["status", "price", "amount"], + TRANSACTIONS: ["type", "amount"], + BALANCES: ["balance"], + FILLS: ["price", "amount"], } return common_cols + data_specific_cols.get(self.key, []) @@ -78,17 +89,25 @@ async def writer(self): async with self.read_queue() as updates: if updates: df = pd.DataFrame(updates) - df['date'] = pd.to_datetime(df['timestamp'], unit='s') - df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') - df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day - + df["date"] = pd.to_datetime(df["timestamp"], unit="s") + df["receipt_timestamp"] = pd.to_datetime( + df["receipt_timestamp"], unit="s" + ) + df["year"], df["month"], df["day"] = ( + df["date"].dt.year, + df["date"].dt.month, + df["date"].dt.day, + ) + # Reorder columns to put exchange and symbol first - cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] df = df[cols] - + if self.custom_columns: df = df.rename(columns=self.custom_columns) - + await self._write_batch(df) async def _write_batch(self, df: pd.DataFrame): @@ -96,6 +115,11 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Ensure timestamp columns are in nanosecond precision + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns for col in numeric_columns: @@ -112,7 +136,7 @@ async def _write_batch(self, df: pd.DataFrame): mode="append", partition_by=self.partition_cols, schema_mode="merge", - storage_options=self.storage_options + storage_options=self.storage_options, ) self.write_count += 1 @@ -252,6 +276,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): self.snapshots_only = snapshots_only self.snapshot_interval = snapshot_interval @@ -359,4 +384,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: string - type: string - account: string (nullable) - """ \ No newline 
at end of file + """ From 090c4f4b9f8b114ac2beaa00d272dbab099ac73e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:47:21 +0200 Subject: [PATCH 07/87] feat: Refactor timestamp column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 963488b29..fa6050edb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -115,10 +129,10 @@ async def _write_batch(self, df: pd.DataFrame): return try: - # Ensure timestamp columns are in nanosecond precision + # Convert timestamp columns from ns to us timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[us]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From 2a60f20188e6850348b9e64f6e32806614607446 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:25:31 +0200 Subject: [PATCH 08/87] fix: Handle null values in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fa6050edb..dca2dd07d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -82,7 +82,7 @@ def __init__( self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ["exchange", "symbol", "timestamp"] + common_cols = ["timestamp"] data_specific_cols = { TRADES: ["price", "amount"], FUNDING: ["rate"], @@ -96,7 +96,9 @@ def _default_z_order_cols(self) -> List[str]: BALANCES: ["balance"], FILLS: ["price", "amount"], } - return common_cols + data_specific_cols.get(self.key, []) + z_order_cols = common_cols + data_specific_cols.get(self.key, []) + # Remove any columns that are already in partition_cols + return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): while self.running: @@ -139,9 +141,20 @@ async def _write_batch(self, df: pd.DataFrame): for col in numeric_columns: df[col] = df[col].astype(self.numeric_type) - # Replace None values with the specified value + # Handle null values if self.none_to is not None: df = df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == 'object': + df[col] = df[col].fillna('') # Replace None with empty string for object columns + elif df[col].dtype in ['float64', 'int64']: + df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns + elif df[col].dtype == 
'bool': + df[col] = df[col].fillna(False) # Replace None with False for boolean columns + elif df[col].dtype == 'datetime64[us]': + df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( From 2f018152ecf34f7810be18f5a6cb58d5beaeef5a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:51:21 +0200 Subject: [PATCH 09/87] feat: Implement DeltaLake backend for Cryptofeed --- cryptofeed/backends/deltalake.py | 357 +++++++++++++++++-------------- 1 file changed, 200 insertions(+), 157 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index dca2dd07d..d75277e1b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,6 +5,7 @@ associated with this software. """ +import asyncio import logging from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -13,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -55,13 +42,7 @@ def __init__( self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or [ - "exchange", - "symbol", - "year", - "month", - "day", - ] + self.partition_cols = partition_cols or ["exchange", "symbol", "dt"] self.optimize_interval = optimize_interval self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel @@ -69,17 +50,31 @@ def __init__( self.write_count = 0 self.running = True - if optimize_interval <= 0: + # Validate configuration parameters + self._validate_configuration() + + self.numeric_type = numeric_type + self.none_to = none_to + + def _validate_configuration(self): + if self.optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") - if not isinstance(self.partition_cols, list): + if not isinstance(self.partition_cols, list) or not all( + isinstance(col, str) for col in self.partition_cols + ): raise TypeError("partition_cols must be a list of strings") - if not isinstance(self.z_order_cols, list): + if not isinstance(self.z_order_cols, list) or not all( + isinstance(col, str) for col in self.z_order_cols + ): raise TypeError("z_order_cols must be a list of strings") - self.numeric_type = numeric_type - self.none_to = none_to + if not isinstance(self.storage_options, dict): + raise TypeError("storage_options must be a dictionary") + + if not isinstance(self.numeric_type, (type, str)): + raise TypeError("numeric_type must be a type or a string") def _default_z_order_cols(self) -> List[str]: common_cols = ["timestamp"] @@ -104,17 +99,9 @@ async def writer(self): while self.running: async with self.read_queue() as updates: if updates: + LOG.info(f"Received {len(updates)} updates for processing.") df = pd.DataFrame(updates) - df["date"] = 
pd.to_datetime(df["timestamp"], unit="s") - df["receipt_timestamp"] = pd.to_datetime( - df["receipt_timestamp"], unit="s" - ) - df["year"], df["month"], df["day"] = ( - df["date"].dt.year, - df["date"].dt.month, - df["date"].dt.day, - ) - + self._convert_fields(df) # Reorder columns to put exchange and symbol first cols = ["exchange", "symbol"] + [ col for col in df.columns if col not in ["exchange", "symbol"] @@ -126,55 +113,126 @@ async def writer(self): await self._write_batch(df) + def _convert_fields(self, df: pd.DataFrame): + LOG.debug("Converting fields in DataFrame.") + self._convert_datetime_fields(df) + self._convert_category_fields(df) + self._convert_int_fields(df) + + def _convert_datetime_fields(self, df: pd.DataFrame): + LOG.debug("Converting datetime fields.") + datetime_columns = ["timestamp", "receipt_timestamp"] + for col in datetime_columns: + if col in df.columns: + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + if "timestamp" in df.columns: + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + + def _convert_category_fields(self, df: pd.DataFrame): + LOG.debug("Converting category fields.") + category_columns = [ + "exchange", + "symbol", + "side", + "type", + "status", + "currency", + "liquidity", + ] + for col in category_columns: + if col in df.columns: + df[col] = df[col].astype("category") + + def _convert_int_fields(self, df: pd.DataFrame): + LOG.debug("Converting integer fields.") + int_columns = ["id", "trade_id", "trades"] + for col in int_columns: + if col in df.columns: + df[col] = df[col].astype("int64") + async def _write_batch(self, df: pd.DataFrame): if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") return - try: - # Convert timestamp columns from ns to us - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[us]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) + max_retries = 3 + retry_delay = 5 # seconds + + for attempt in range(max_retries): + try: + LOG.info( + f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." 
+ ) + # Convert timestamp columns to datetime64[ns] + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Handle null values + df = self._handle_null_values(df) + + LOG.info( + f"Writing batch of {len(df)} records to {self.delta_table_path}" + ) + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options, + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + LOG.info("Batch write successful.") + break # Exit the retry loop if write is successful + + except Exception as e: + LOG.error( + f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" + ) + if attempt < max_retries - 1: + LOG.info(f"Retrying in {retry_delay} seconds...") + await asyncio.sleep(retry_delay) + else: + LOG.error( + "Max retries reached. Failed to write batch to Delta Lake." + ) - # Handle null values - if self.none_to is not None: - df = df.fillna(self.none_to) - else: - # Replace None with appropriate default values based on column type - for col in df.columns: - if df[col].dtype == 'object': - df[col] = df[col].fillna('') # Replace None with empty string for object columns - elif df[col].dtype in ['float64', 'int64']: - df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns - elif df[col].dtype == 'bool': - df[col] = df[col].fillna(False) # Replace None with False for boolean columns - elif df[col].dtype == 'datetime64[us]': - df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns - - LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") - write_deltalake( - self.delta_table_path, - df, - mode="append", - partition_by=self.partition_cols, - schema_mode="merge", - storage_options=self.storage_options, - ) - self.write_count += 1 - - if self.write_count % self.optimize_interval == 0: - await self._optimize_table() - - if self.time_travel: - self._update_metadata() - - except Exception as e: - LOG.error(f"Error writing to Delta Lake: {e}") + def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: + if self.none_to is not None: + return df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].fillna( + "" + ) # Replace None with empty string for object columns + elif df[col].dtype in ["float64", "int64"]: + df[col] = df[col].fillna( + 0 + ) # Replace None with 0 for numeric columns + elif df[col].dtype == "bool": + df[col] = df[col].fillna( + False + ) # Replace None with False for boolean columns + elif df[col].dtype == "datetime64[ns]": + df[col] = df[col].fillna( + pd.Timestamp.min + ) # Replace None with minimum timestamp for datetime columns + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") @@ -182,21 +240,27 @@ async def _optimize_table(self): dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) + LOG.info("OPTIMIZE operation completed.") def _update_metadata(self): dt = 
DeltaTable(self.delta_table_path, storage_options=self.storage_options) LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") async def stop(self): + LOG.info("Stopping DeltaLakeCallback writer.") self.running = False def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) if timestamp: - return dt.version_at_timestamp(timestamp) + version = dt.version_at_timestamp(timestamp) + LOG.info(f"Retrieved version {version} for timestamp {timestamp}.") + return version else: - return dt.version() + version = dt.version() + LOG.info(f"Retrieved current version {version}.") + return version else: LOG.warning("Time travel is not enabled for this table") return None @@ -208,16 +272,15 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string (nullable) - - side: string + - dt: string + - exchange: category + - symbol: category + - id: int64 (nullable) + - side: category - amount: float64 - price: float64 - - type: string (nullable) + - type: category (nullable) + - trade_id: int64 """ @@ -227,11 +290,9 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - mark_price: float64 (nullable) - rate: float64 - next_funding_time: datetime64[ns] (nullable) @@ -245,11 +306,9 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - bid: float64 - ask: float64 """ @@ -261,11 +320,9 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - open_interest: float64 """ @@ -276,16 +333,14 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - side: string + - dt: string + - exchange: category + - symbol: category + - side: category - quantity: float64 - price: float64 - - id: string - - status: string + - id: int64 + - status: category """ @@ -295,11 +350,9 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ @@ -317,11 +370,9 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - 
year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - start: datetime64[ns] - stop: datetime64[ns] - interval: string @@ -341,16 +392,14 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string + - dt: string + - exchange: category + - symbol: category + - id: int64 - client_order_id: string (nullable) - - side: string - - status: string - - type: string + - side: category + - status: category + - type: category - price: float64 - amount: float64 - remaining: float64 (nullable) @@ -364,13 +413,11 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string - - type: string - - status: string + - dt: string + - exchange: category + - currency: category + - type: category + - status: category - amount: float64 """ @@ -381,11 +428,9 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string + - dt: string + - exchange: category + - currency: category - balance: float64 - reserved: float64 (nullable) """ @@ -397,18 +442,16 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - price: float64 - amount: float64 - - side: string + - side: category - fee: float64 (nullable) - - id: string - - order_id: string - - liquidity: string - - type: string + - id: int64 + - order_id: int64 + - liquidity: category + - type: category - account: string (nullable) """ From 2081b11d6162bd596176822e3216e85f26d5ace2 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:57:16 +0200 Subject: [PATCH 10/87] fix: Refactor DeltaLakeCallback class --- cryptofeed/backends/deltalake.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d75277e1b..6e158f364 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -49,12 +63,10 @@ def __init__( self.storage_options = storage_options or {} self.write_count = 0 self.running = True - - # Validate configuration parameters - self._validate_configuration() - 
self.numeric_type = numeric_type self.none_to = none_to + # Validate configuration parameters + self._validate_configuration() def _validate_configuration(self): if self.optimize_interval <= 0: From a9ba6b224cb9333535e9693c2c3cd63fb3447a91 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:13:37 +0200 Subject: [PATCH 11/87] fix: Add debug logging for DataFrame schema in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 6e158f364..7991f84cb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -175,6 +175,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema:\n{df.dtypes}") + # Convert timestamp columns to datetime64[ns] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: @@ -191,6 +194,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") + write_deltalake( self.delta_table_path, df, From 4ab9fd87939a8e8a0c0a4d30d914018977abf539 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:20:07 +0200 Subject: [PATCH 12/87] fix: Add DataFrame schema logging when timestamp-related error occurs during Delta Lake write --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 7991f84cb..ffb47db7d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -217,9 +217,13 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: + # When error is related to timestamp, print the schema of the DataFrame + if "timestamp" in str(e): + LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) + if attempt < max_retries - 1: LOG.info(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) From 797a789d5a9b5a905e48e8d632546cc292a402b0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:36:58 +0200 Subject: [PATCH 13/87] fix: convert timestamp columns to datetime64[ms] --- cryptofeed/backends/deltalake.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ffb47db7d..57084ec1a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -136,7 +136,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") @@ -178,10 +178,10 @@ async def _write_batch(self, df: pd.DataFrame): # Debug output the schema of the DataFrame LOG.debug(f"DataFrame 
schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ns] + # Convert timestamp columns to datetime64[ms] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[ms]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From ceb8f76f54117c5e73c5fb688a90fef2b1a83dac Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:59:19 +0200 Subject: [PATCH 14/87] fix: Ensure all partition columns are present in the DataFrame --- cryptofeed/backends/deltalake.py | 36 +++++++++++++++----------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 57084ec1a..32e9a902a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -167,6 +153,18 @@ async def _write_batch(self, df: pd.DataFrame): LOG.warning("DataFrame is empty. Skipping write operation.") return + # Ensure all partition columns are present in the DataFrame + for col in self.partition_cols: + if col not in df.columns: + if col == "exchange" or col == "symbol": + df[col] = "" # Default to empty string for categorical columns + elif col == "dt": + df[col] = pd.Timestamp.min.strftime( + "%Y-%m-%d" + ) # Default to min date for date columns + else: + df[col] = 0 # Default to 0 for numeric columns + max_retries = 3 retry_delay = 5 # seconds @@ -218,8 +216,8 @@ async def _write_batch(self, df: pd.DataFrame): except Exception as e: # When error is related to timestamp, print the schema of the DataFrame - if "timestamp" in str(e): - LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) From 84c436d73b0685aeb7aeb04b18195ad689ac7844 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:00:25 +0200 Subject: [PATCH 15/87] fix: convert timestamp column to datetime64[ms] format --- cryptofeed/backends/deltalake.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 32e9a902a..cb9f98328 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + 
BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -124,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = df["timestamp"].dt.date.astype("string") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") @@ -248,9 +262,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: df[col] = df[col].fillna( False ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ns]": + elif df[col].dtype == "datetime64[ms]": df[col] = df[col].fillna( - pd.Timestamp.min + pd.Timestamp.min.astype("datetime64[ms]") ) # Replace None with minimum timestamp for datetime columns return df From 84fd533c13915a3dc83d6070fa1fb73fd63e76dd Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:11:45 +0200 Subject: [PATCH 16/87] feat: Convert timestamp column to date string format in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index cb9f98328..d05ebb06d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -138,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.date.astype("string") + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") From 5e4c4dfcb4ef6fc83324225c4818d159e89f0027 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:32:15 +0200 Subject: [PATCH 17/87] refactor: Simplify null value handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d05ebb06d..e38a755c6 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -248,25 +248,18 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: if self.none_to is not None: return df.fillna(self.none_to) else: - # Replace None with appropriate default values based on column type for col in df.columns: - if df[col].dtype == "object": - df[col] = df[col].fillna( - "" - ) # Replace None with empty string for object columns - elif df[col].dtype in ["float64", "int64"]: - df[col] = df[col].fillna( - 0 - ) # Replace None with 0 for numeric columns - elif df[col].dtype == "bool": - df[col] = df[col].fillna( - False - ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ms]": - df[col] = df[col].fillna( - pd.Timestamp.min.astype("datetime64[ms]") - ) # Replace None with minimum timestamp for datetime columns - return df + if pd.api.types.is_string_dtype( + df[col] + ) or pd.api.types.is_categorical_dtype(df[col]): + df[col] = df[col].fillna("") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna(0) + elif 
pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna(False) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") From 5bdd670bdb972343316d85bc776dbe1b3cdb09a9 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:42:41 +0200 Subject: [PATCH 18/87] fix: Ensure empty string is a category in categorical columns and handle null values correctly --- cryptofeed/backends/deltalake.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e38a755c6..c15c85c10 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -153,7 +153,11 @@ def _convert_category_fields(self, df: pd.DataFrame): ] for col in category_columns: if col in df.columns: - df[col] = df[col].astype("category") + # Add empty string as a category if it's not already present + categories = df[col].unique().tolist() + if '' not in categories: + categories.append('') + df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): LOG.debug("Converting integer fields.") @@ -249,9 +253,12 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_string_dtype( - df[col] - ) or pd.api.types.is_categorical_dtype(df[col]): + if pd.api.types.is_categorical_dtype(df[col]): + # Ensure '' is in the categories before filling + if '' not in df[col].cat.categories: + df[col] = df[col].cat.add_categories(['']) + df[col] = df[col].fillna('') + elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) From 099c6c4f9cea7b540cfba719f6480f9b460cb62b Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:54:28 +0200 Subject: [PATCH 19/87] refactor: Refactor DeltaLakeCallback class to improve code readability and maintainability --- cryptofeed/backends/deltalake.py | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c15c85c10..b9ac6c296 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -128,7 +114,7 @@ async def writer(self): def _convert_fields(self, df: pd.DataFrame): LOG.debug("Converting fields in DataFrame.") self._convert_datetime_fields(df) - self._convert_category_fields(df) + # self._convert_category_fields(df) self._convert_int_fields(df) def _convert_datetime_fields(self, df: pd.DataFrame): @@ -155,8 +141,8 @@ def 
_convert_category_fields(self, df: pd.DataFrame): if col in df.columns: # Add empty string as a category if it's not already present categories = df[col].unique().tolist() - if '' not in categories: - categories.append('') + if "" not in categories: + categories.append("") df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): @@ -255,9 +241,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: for col in df.columns: if pd.api.types.is_categorical_dtype(df[col]): # Ensure '' is in the categories before filling - if '' not in df[col].cat.categories: - df[col] = df[col].cat.add_categories(['']) - df[col] = df[col].fillna('') + if "" not in df[col].cat.categories: + df[col] = df[col].cat.add_categories([""]) + df[col] = df[col].fillna("") elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): From d3a9c717a4e788df330538a6e6ed058ad63b4f08 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:00:09 +0200 Subject: [PATCH 20/87] fix: Improve error handling and logging in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 34 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index b9ac6c296..0ed17520b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -219,8 +233,9 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame + # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame:\n{df}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" @@ -239,19 +254,16 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_categorical_dtype(df[col]): - # Ensure '' is in the categories before filling - if "" not in df[col].cat.categories: - df[col] = df[col].cat.add_categories([""]) - df[col] = df[col].fillna("") - elif pd.api.types.is_string_dtype(df[col]): + if pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) elif pd.api.types.is_bool_dtype(df[col]): df[col] = df[col].fillna(False) elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + df[col] = df[col].fillna(pd.Timestamp.min) + else: + df[col] = df[col].fillna(None) return df async def _optimize_table(self): From fd825345a05f2e7a0607833c96557693ee165a66 Mon Sep 17 00:00:00 2001 From: Tommy 
K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:07:09 +0200 Subject: [PATCH 21/87] fix: Optimize Delta Lake table by filling null values with empty strings --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ed17520b..ccc8232b5 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -263,7 +249,7 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna(None) + df[col] = df[col].fillna("") return df async def _optimize_table(self): From 1b37d26de3e9c8972ad17bf053b5f7a75ce53b34 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:44:08 +0200 Subject: [PATCH 22/87] fix: optimize handling of missing data in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ccc8232b5..16a25e2b1 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -249,7 +263,8 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna("") + # For any other data types, use an empty string as a fallback + df[col] = df[col].astype(object).fillna("") return df async def _optimize_table(self): From 81870dc8bd5ad3a16b210b86bee0792393a4ac80 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 03:55:54 +0200 Subject: [PATCH 23/87] feat: Add custom transformations and improve column validation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 217 +++++++++++++++++++------------ 1 file changed, 131 insertions(+), 86 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 16a25e2b1..e135b87c0 100644 --- a/cryptofeed/backends/deltalake.py +++ 
b/cryptofeed/backends/deltalake.py @@ -49,6 +49,7 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): super().__init__() @@ -65,6 +66,16 @@ def __init__( self.running = True self.numeric_type = numeric_type self.none_to = none_to + self.transformations = [ + self._rename_custom_columns, + self._convert_datetime_columns, + self._convert_int_columns, + self._ensure_partition_columns, + self._handle_missing_values, + self._reorder_columns, + ] + if custom_transformations: + self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() @@ -112,76 +123,146 @@ async def writer(self): async with self.read_queue() as updates: if updates: LOG.info(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) - self._convert_fields(df) - # Reorder columns to put exchange and symbol first - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", "symbol"] - ] - df = df[cols] - if self.custom_columns: - df = df.rename(columns=self.custom_columns) + self._transform_columns(df) + self._validate_columns(df) await self._write_batch(df) - def _convert_fields(self, df: pd.DataFrame): - LOG.debug("Converting fields in DataFrame.") - self._convert_datetime_fields(df) - # self._convert_category_fields(df) - self._convert_int_fields(df) + def _validate_columns(self, df: pd.DataFrame): + LOG.debug("Validating DataFrame columns.") + # Check for required columns + required_columns = ["exchange", "symbol", "dt"] + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") + + # Validate partition columns + for col in self.partition_cols: + if col not in df.columns: + raise ValueError(f"Partition column '{col}' not found in DataFrame") + if df[col].isnull().any(): + raise ValueError(f"Partition column '{col}' contains null values") + + # Validate data types + expected_types = { + "exchange": "object", + "symbol": "object", + "dt": "object", + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", + } + for col, expected_type in expected_types.items(): + if col in df.columns and not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) + + LOG.debug("DataFrame columns validation completed successfully.") + + def _transform_columns(self, df: pd.DataFrame): + LOG.debug("Transforming columns in DataFrame.") + for transformation in self.transformations: + transformation(df) + + def _rename_custom_columns(self, df: pd.DataFrame): + if self.custom_columns: + LOG.debug("Renaming columns based on custom_columns configuration.") + df.rename(columns=self.custom_columns, inplace=True) + + def _reorder_columns(self, df: pd.DataFrame): + LOG.debug("Reordering columns to prioritize exchange and symbol.") + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] + df.reindex(columns=cols, inplace=True) - def _convert_datetime_fields(self, df: pd.DataFrame): - LOG.debug("Converting datetime fields.") + def _convert_datetime_columns(self, df: pd.DataFrame): + LOG.debug("Converting datetime columns.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = 
pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") + df[col] = pd.to_datetime(df[col], unit="ms") + + # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + elif "receipt_timestamp" in df.columns: + df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + else: + LOG.warning("No timestamp column found. Using current date for 'dt'.") + df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") - def _convert_category_fields(self, df: pd.DataFrame): - LOG.debug("Converting category fields.") - category_columns = [ - "exchange", - "symbol", - "side", - "type", - "status", - "currency", - "liquidity", - ] - for col in category_columns: - if col in df.columns: - # Add empty string as a category if it's not already present - categories = df[col].unique().tolist() - if "" not in categories: - categories.append("") - df[col] = pd.Categorical(df[col], categories=categories) - - def _convert_int_fields(self, df: pd.DataFrame): - LOG.debug("Converting integer fields.") + def _convert_int_columns(self, df: pd.DataFrame): + LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] for col in int_columns: if col in df.columns: - df[col] = df[col].astype("int64") + df[col] = pd.to_numeric(df[col], errors="coerce").astype( + "Int64" + ) # Use nullable integer type - async def _write_batch(self, df: pd.DataFrame): - if df.empty: - LOG.warning("DataFrame is empty. Skipping write operation.") - return - - # Ensure all partition columns are present in the DataFrame + def _ensure_partition_columns(self, df: pd.DataFrame): + LOG.debug("Ensuring all partition columns are present and not null.") for col in self.partition_cols: if col not in df.columns: - if col == "exchange" or col == "symbol": - df[col] = "" # Default to empty string for categorical columns + if col in ["exchange", "symbol"]: + df[col] = "unknown" elif col == "dt": - df[col] = pd.Timestamp.min.strftime( - "%Y-%m-%d" - ) # Default to min date for date columns + # 'dt' should already be created in _convert_datetime_columns + LOG.warning("'dt' column not found. This should not happen.") + df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") else: - df[col] = 0 # Default to 0 for numeric columns + df[col] = "unknown" + + # Fill any remaining null values + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." + ) + df[col] = df[col].fillna( + "unknown" + if col != "dt" + else pd.Timestamp.now().strftime("%Y-%m-%d") + ) + + def _handle_missing_values(self, df: pd.DataFrame): + LOG.debug("Handling missing values.") + for col in df.columns: + if col in ["exchange", "symbol"]: # Removed 'dt' from this list + # These are partition columns and should never be null + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." 
+ ) + df[col] = df[col].fillna("unknown") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else 0 + ) + elif pd.api.types.is_string_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + elif pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else False + ) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else pd.NaT + ) + else: + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") + return max_retries = 3 retry_delay = 5 # seconds @@ -191,27 +272,11 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - # Debug output the schema of the DataFrame LOG.debug(f"DataFrame schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ms] - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ms]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) - - # Handle null values - df = self._handle_null_values(df) - LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) - # Debug output the schema of the DataFrame - LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") write_deltalake( self.delta_table_path, @@ -233,10 +298,8 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error(f"DataFrame:\n{df}") - LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) @@ -249,24 +312,6 @@ async def _write_batch(self, df: pd.DataFrame): "Max retries reached. Failed to write batch to Delta Lake." 
) - def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: - if self.none_to is not None: - return df.fillna(self.none_to) - else: - for col in df.columns: - if pd.api.types.is_string_dtype(df[col]): - df[col] = df[col].fillna("") - elif pd.api.types.is_numeric_dtype(df[col]): - df[col] = df[col].fillna(0) - elif pd.api.types.is_bool_dtype(df[col]): - df[col] = df[col].fillna(False) - elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min) - else: - # For any other data types, use an empty string as a fallback - df[col] = df[col].astype(object).fillna("") - return df - async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 880c079063d4df2817f7509ffff9c85245406466 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:11:14 +0200 Subject: [PATCH 24/87] fix: Add logging configuration to deltalake backend --- cryptofeed/backends/deltalake.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e135b87c0..94628ad1e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -10,28 +10,16 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union -import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +# Add these lines after the imports +logging.basicConfig(level=logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") From e45dee7f3ed1e59c43cbe2f2308696c039279a04 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:19:36 +0200 Subject: [PATCH 25/87] fix: Initialize DeltaLakeCallback and add logging for writer method and _write_batch --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 94628ad1e..91c139d5b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -40,6 +40,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): + LOG.debug("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -107,8 +108,10 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): + LOG.debug("Writer method called") while self.running: async with self.read_queue() as updates: + LOG.debug(f"Read queue returned {len(updates)} updates") if updates: LOG.info(f"Received {len(updates)} updates for processing.") @@ -248,6 +251,7 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): + LOG.debug(f"_write_batch called with 
DataFrame of shape {df.shape}") if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return From fcaa65f4c88ba467e62e34ebcd83eec30814db0e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:23:20 +0200 Subject: [PATCH 26/87] fix: Change logging levels from DEBUG to WARNING in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 43 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 91c139d5b..68250f6c8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -17,9 +17,10 @@ from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + # Add these lines after the imports -logging.basicConfig(level=logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) +# logging.basicConfig(level=logging.DEBUG) +# logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") @@ -40,7 +41,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.debug("Initializing DeltaLakeCallback") + LOG.warning("Initializing DeltaLakeCallback") # Changed to warning super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -108,12 +109,16 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.debug("Writer method called") + LOG.warning("Writer method called") # Changed to warning while self.running: async with self.read_queue() as updates: - LOG.debug(f"Read queue returned {len(updates)} updates") + LOG.warning( + f"Read queue returned {len(updates)} updates" + ) # Changed to warning if updates: - LOG.info(f"Received {len(updates)} updates for processing.") + LOG.warning( + f"Received {len(updates)} updates for processing." + ) # Changed to warning df = pd.DataFrame(updates) @@ -251,7 +256,9 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): - LOG.debug(f"_write_batch called with DataFrame of shape {df.shape}") + LOG.warning( + f"_write_batch called with DataFrame of shape {df.shape}" + ) # Changed to warning if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -261,14 +268,14 @@ async def _write_batch(self, df: pd.DataFrame): for attempt in range(max_retries): try: - LOG.info( + LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) - LOG.debug(f"DataFrame schema:\n{df.dtypes}") + ) # Changed to warning + LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning - LOG.info( + LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) + ) # Changed to warning write_deltalake( self.delta_table_path, @@ -286,7 +293,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.info("Batch write successful.") + LOG.warning("Batch write successful.") # Changed to warning break # Exit the retry loop if write is successful except Exception as e: @@ -297,7 +304,9 @@ async def _write_batch(self, df: pd.DataFrame): ) if attempt < max_retries - 1: - LOG.info(f"Retrying in {retry_delay} seconds...") + LOG.warning( + f"Retrying in {retry_delay} seconds..." 
+ ) # Changed to warning await asyncio.sleep(retry_delay) else: LOG.error( @@ -305,12 +314,14 @@ async def _write_batch(self, df: pd.DataFrame): ) async def _optimize_table(self): - LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + LOG.warning( + f"Running OPTIMIZE on table {self.delta_table_path}" + ) # Changed to warning dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.info("OPTIMIZE operation completed.") + LOG.warning("OPTIMIZE operation completed.") # Changed to warning def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 414558816cad0b94f0fced799624db30ec9dca9e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:49:40 +0200 Subject: [PATCH 27/87] fix: Improve logging and error handling in DeltaLakeCallback writer method --- cryptofeed/backends/deltalake.py | 61 ++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 68250f6c8..81f6f34b8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) # Add these lines after the imports @@ -109,23 +123,32 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.warning("Writer method called") # Changed to warning + LOG.warning("Writer method started") while self.running: - async with self.read_queue() as updates: - LOG.warning( - f"Read queue returned {len(updates)} updates" - ) # Changed to warning - if updates: - LOG.warning( - f"Received {len(updates)} updates for processing." 
- ) # Changed to warning - - df = pd.DataFrame(updates) - - self._transform_columns(df) - self._validate_columns(df) - - await self._write_batch(df) + try: + async with self.read_queue() as updates: + LOG.warning(f"Read queue returned {len(updates)} updates") + if updates: + LOG.warning(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_fields(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + else: + LOG.warning("No updates received, continuing loop") + except Exception as e: + LOG.error(f"Error in writer method: {e}", exc_info=True) + LOG.warning("Writer method ended") def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") From 274ca48695b4da3df9e4475163f14e8548e13b26 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:59:06 +0200 Subject: [PATCH 28/87] fix: Refactor field transformation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 81f6f34b8..0ce9b9116 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,23 +13,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) # Add these lines after the imports @@ -134,7 +120,7 @@ async def writer(self): LOG.warning(f"Created DataFrame with shape: {df.shape}") LOG.warning("Starting field transformation") - self._transform_fields(df) + self._transform_columns(df) LOG.warning("Field transformation completed") LOG.warning("Validating columns") From 43bbd3ad63c0258edb6fae5f981625b8008fef4d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:15:11 +0200 Subject: [PATCH 29/87] fix: Reorder columns to prioritize exchange and symbol --- cryptofeed/backends/deltalake.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ce9b9116..314bae1bf 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -62,7 +62,6 @@ def __init__( self._convert_int_columns, self._ensure_partition_columns, self._handle_missing_values, - self._reorder_columns, ] if custom_transformations: self.transformations.extend(custom_transformations) @@ -179,10 +178,9 @@ def _rename_custom_columns(self, df: pd.DataFrame): def _reorder_columns(self, df: pd.DataFrame): LOG.debug("Reordering columns to prioritize exchange and symbol.") - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", 
"symbol"] - ] - df.reindex(columns=cols, inplace=True) + priority_cols = ["exchange", "symbol"] + other_cols = [col for col in df.columns if col not in priority_cols] + df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns.") From e59f57dbe389f4e12d9d2ad6e57a475febd63354 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:22:38 +0200 Subject: [PATCH 30/87] fix: Ensure datetime columns have millisecond precision in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 314bae1bf..ee7027884 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,18 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' } for col, expected_type in expected_types.items(): - if col in df.columns and not df[col].dtype == expected_type: - raise TypeError( - f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" - ) + if col in df.columns: + if expected_type.startswith("datetime64"): + # Convert to millisecond precision if it's a datetime column + df[col] = df[col].astype('datetime64[ms]') + if not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) LOG.debug("DataFrame columns validation completed successfully.") @@ -187,7 +191,8 @@ def _convert_datetime_columns(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ms") + # Convert to millisecond precision + df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: From 4bacc42ecc377a5a4d3301a3dc35692bd59cfb09 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:39:57 +0200 Subject: [PATCH 31/87] feat: Ensure datetime columns are in millisecond precision --- cryptofeed/backends/deltalake.py | 95 ++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ee7027884..f3b8eff50 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' - "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type.startswith("datetime64"): - # Convert to millisecond precision if it's a datetime column - df[col] = df[col].astype('datetime64[ms]') + if expected_type == "datetime64[ms]": + # Ensure datetime columns are in millisecond precision + df[col] = df[col].astype("datetime64[ms]") if not df[col].dtype == expected_type: raise 
TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,22 +187,47 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns.") + LOG.debug("Converting datetime columns to millisecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - # Convert to millisecond precision - df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') + # Log sample of original values + LOG.warning( + f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" + ) + # Convert to millisecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Log sample of converted values in readable format + if len(df) > 0: + readable_time = ( + df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + ) + LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' + min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["timestamp"] + .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) elif "receipt_timestamp" in df.columns: - df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["receipt_timestamp"] + .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) else: LOG.warning("No timestamp column found. Using current date for 'dt'.") df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + # Log sample of 'dt' column + if "dt" in df.columns and len(df) > 0: + LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + + LOG.debug("Datetime columns converted to millisecond precision.") + def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] @@ -363,8 +388,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -381,14 +406,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ns] (nullable) + - next_funding_time: datetime64[ms] (nullable) - predicted_rate: float64 (nullable) """ @@ -397,8 +422,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -411,8 +436,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: 
datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -424,8 +449,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -441,8 +466,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -461,13 +486,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - - start: datetime64[ns] - - stop: datetime64[ns] + - start: datetime64[ms] + - stop: datetime64[ms] - interval: string - trades: int64 (nullable) - open: float64 @@ -483,8 +508,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -504,8 +529,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -519,8 +544,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -533,8 +558,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category From d520c0269b532fb23a64af27c8180e2ec681be8e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 00:42:34 +0200 Subject: [PATCH 32/87] chore: Convert datetime columns to microsecond precision --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 72 ++++++++++++++++---------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index ac64f2b9e..ed13eb5fa 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ ENV/ # PyCharm .idea/ .aider* +.trunk/ diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index f3b8eff50..af5d6b31e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def 
_validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[us]", + "receipt_timestamp": "datetime64[us]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type == "datetime64[ms]": - # Ensure datetime columns are in millisecond precision - df[col] = df[col].astype("datetime64[ms]") + if expected_type == "datetime64[us]": + # Ensure datetime columns are in microsecond precision + df[col] = df[col].astype("datetime64[us]") if not df[col].dtype == expected_type: raise TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,7 +187,7 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns to millisecond precision.") + LOG.debug("Converting datetime columns to microsecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: @@ -195,13 +195,11 @@ def _convert_datetime_columns(self, df: pd.DataFrame): LOG.warning( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) - # Convert to millisecond precision, handling both string and datetime inputs - df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Convert to microsecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[us]") # Log sample of converted values in readable format if len(df) > 0: - readable_time = ( - df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] - ) + readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' @@ -226,7 +224,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): if "dt" in df.columns and len(df) > 0: LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") - LOG.debug("Datetime columns converted to millisecond precision.") + LOG.debug("Datetime columns converted to microsecond precision.") def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") @@ -388,8 +386,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -406,14 +404,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ms] (nullable) + - next_funding_time: datetime64[us] (nullable) - predicted_rate: float64 (nullable) """ @@ -422,8 +420,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category 
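The hunks above convert the timestamp columns to millisecond precision and derive the 'dt' partition column with a validity floor, falling back to the current date when no usable timestamp exists. A minimal standalone sketch of that derivation follows, assuming cryptofeed-style epoch-second timestamps and a hypothetical sample row; it is an illustration of the logic, not part of the patch.

import pandas as pd

# Hypothetical one-row batch; the epoch-seconds input unit is an assumption.
df = pd.DataFrame({
    "exchange": ["COINBASE"],
    "symbol": ["BTC-USD"],
    "timestamp": [1693526400.123],
})

# Parse and truncate to millisecond precision, mirroring _convert_datetime_columns.
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s").astype("datetime64[ms]")

# Replace obviously invalid timestamps before deriving the partition value.
min_valid_date = pd.Timestamp("2000-01-01")
df["dt"] = (
    df["timestamp"]
    .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now())
    .dt.strftime("%Y-%m-%d")
)

print(df[["timestamp", "dt"]])  # 'dt' is the string partition key, e.g. 2023-09-01
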
@@ -436,8 +434,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -449,8 +447,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -466,8 +464,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -486,13 +484,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - - start: datetime64[ms] - - stop: datetime64[ms] + - start: datetime64[us] + - stop: datetime64[us] - interval: string - trades: int64 (nullable) - open: float64 @@ -508,8 +506,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -529,8 +527,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -544,8 +542,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -558,8 +556,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category From b11f9a4494f66cb19c9e556082fe09b3f3ea96e0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:04:50 +0200 Subject: [PATCH 33/87] fix: Change log levels from warning to debug for non-critical messages --- cryptofeed/backends/deltalake.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index af5d6b31e..412794626 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -41,7 +41,7 @@ 
def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.warning("Initializing DeltaLakeCallback") # Changed to warning + LOG.warning("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -192,7 +192,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in datetime_columns: if col in df.columns: # Log sample of original values - LOG.warning( + LOG.debug( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) # Convert to microsecond precision, handling both string and datetime inputs @@ -200,7 +200,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of converted values in readable format if len(df) > 0: readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") - LOG.warning(f"Sample {col} after conversion: {readable_time}") + LOG.debug(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed @@ -222,7 +222,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: - LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") LOG.debug("Datetime columns converted to microsecond precision.") @@ -293,7 +293,7 @@ def _handle_missing_values(self, df: pd.DataFrame): async def _write_batch(self, df: pd.DataFrame): LOG.warning( f"_write_batch called with DataFrame of shape {df.shape}" - ) # Changed to warning + ) if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -305,12 +305,12 @@ async def _write_batch(self, df: pd.DataFrame): try: LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) # Changed to warning - LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning + ) + LOG.debug(f"DataFrame schema:\n{df.dtypes}") LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) # Changed to warning + ) write_deltalake( self.delta_table_path, @@ -328,7 +328,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.warning("Batch write successful.") # Changed to warning + LOG.warning("Batch write successful.") break # Exit the retry loop if write is successful except Exception as e: @@ -341,7 +341,7 @@ async def _write_batch(self, df: pd.DataFrame): if attempt < max_retries - 1: LOG.warning( f"Retrying in {retry_delay} seconds..." 
- ) # Changed to warning + ) await asyncio.sleep(retry_delay) else: LOG.error( @@ -351,12 +351,12 @@ async def _write_batch(self, df: pd.DataFrame): async def _optimize_table(self): LOG.warning( f"Running OPTIMIZE on table {self.delta_table_path}" - ) # Changed to warning + ) dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.warning("OPTIMIZE operation completed.") # Changed to warning + LOG.warning("OPTIMIZE operation completed.") def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 11893dbfc66171e42f98e8bbc081601f7f11ca5b Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:25:25 +0200 Subject: [PATCH 34/87] refactor: Simplify datetime column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 40 +++++++++++++------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 412794626..d11c3c596 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -205,20 +205,12 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = ( - df["timestamp"] - .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["timestamp"].where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date elif "receipt_timestamp" in df.columns: - df["dt"] = ( - df["receipt_timestamp"] - .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["receipt_timestamp"].where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date else: LOG.warning("No timestamp column found. Using current date for 'dt'.") - df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + df["dt"] = pd.Timestamp.now().date() # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: @@ -244,7 +236,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): elif col == "dt": # 'dt' should already be created in _convert_datetime_columns LOG.warning("'dt' column not found. 
This should not happen.") - df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") + df[col] = pd.Timestamp.now().date() else: df[col] = "unknown" @@ -256,7 +248,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): df[col] = df[col].fillna( "unknown" if col != "dt" - else pd.Timestamp.now().strftime("%Y-%m-%d") + else pd.Timestamp.now().date() ) def _handle_missing_values(self, df: pd.DataFrame): @@ -388,7 +380,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 (nullable) @@ -406,7 +398,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - mark_price: float64 (nullable) @@ -422,7 +414,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - bid: float64 @@ -436,7 +428,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - open_interest: float64 @@ -449,7 +441,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - side: category @@ -466,7 +458,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) @@ -486,7 +478,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - start: datetime64[us] @@ -508,7 +500,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 @@ -529,7 +521,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - type: category @@ -544,7 +536,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - balance: float64 @@ -558,7 +550,7 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - price: float64 From 313a8f829c2ffc012be96e810b804f6250a50d34 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:05:38 +0200 Subject: [PATCH 35/87] feat: Add batch processing and 
flush interval to DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 53 +++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d11c3c596..5bfa7bd24 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -7,6 +7,7 @@ import asyncio import logging +import time from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -38,6 +39,8 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + batch_size: int = 1000, + flush_interval: float = 60.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): @@ -67,6 +70,10 @@ def __init__( self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() + self.batch_size = batch_size + self.flush_interval = flush_interval + self.batch = [] + self.last_flush_time = time.time() def _validate_configuration(self): if self.optimize_interval <= 0: @@ -115,26 +122,41 @@ async def writer(self): LOG.warning(f"Read queue returned {len(updates)} updates") if updates: LOG.warning(f"Received {len(updates)} updates for processing.") - df = pd.DataFrame(updates) - LOG.warning(f"Created DataFrame with shape: {df.shape}") + self.batch.extend(updates) - LOG.warning("Starting field transformation") - self._transform_columns(df) - LOG.warning("Field transformation completed") - - LOG.warning("Validating columns") - self._validate_columns(df) - LOG.warning("Columns validation completed") - - LOG.warning("Starting batch write") - await self._write_batch(df) - LOG.warning("Batch write completed") + if len(self.batch) >= self.batch_size or (time.time() - self.last_flush_time) >= self.flush_interval: + await self._process_batch() else: - LOG.warning("No updates received, continuing loop") + # Check if we need to flush based on time + if (time.time() - self.last_flush_time) >= self.flush_interval and self.batch: + await self._process_batch() + else: + LOG.warning("No updates received, continuing loop") + await asyncio.sleep(1) # Add a small delay to prevent busy-waiting except Exception as e: LOG.error(f"Error in writer method: {e}", exc_info=True) LOG.warning("Writer method ended") + async def _process_batch(self): + LOG.warning(f"Processing batch of {len(self.batch)} updates") + df = pd.DataFrame(self.batch) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_columns(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + + self.batch = [] + self.last_flush_time = time.time() + def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") # Check for required columns @@ -357,6 +379,9 @@ def _update_metadata(self): async def stop(self): LOG.info("Stopping DeltaLakeCallback writer.") self.running = False + # Flush any remaining data + if self.batch: + await self._process_batch() def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: From f5d3f8c38101ae4f7e0f8a0a943e3d29bd2427f8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:25:43 +0200 
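For reference, the batching added above flushes on whichever condition is met first: the batch reaching batch_size entries, or flush_interval seconds elapsing since the last flush. A minimal sketch of that predicate, using the default values from the patch (illustrative helper, not part of the patch itself):

    import time

    def should_flush(batch: list, last_flush_time: float,
                     batch_size: int = 1000, flush_interval: float = 60.0) -> bool:
        # Never flush an empty batch; otherwise flush on size or on elapsed time.
        if not batch:
            return False
        return len(batch) >= batch_size or (time.time() - last_flush_time) >= flush_interval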
Subject: [PATCH 36/87] fix: Update copyright year in demo_deltalake.py --- examples/demo_deltalake.py | 4 ++-- setup.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index 5c973016e..95d2a405d 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -1,9 +1,9 @@ -''' +""" Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' +""" from cryptofeed import FeedHandler from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake from cryptofeed.defines import FUNDING, TICKER, TRADES diff --git a/setup.py b/setup.py index 344573348..fe38f7b7d 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,9 @@ import os import sys -from setuptools import Extension, setup -from setuptools import find_packages -from setuptools.command.test import test as TestCommand from Cython.Build import cythonize +from setuptools import Extension, find_packages, setup +from setuptools.command.test import test as TestCommand def get_long_description(): From 85ad6681100035761d38c0cf126da2725940a663 Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sat, 31 Aug 2024 23:37:59 +0200 Subject: [PATCH 37/87] feat(deltalake): Implement Delta Lake backend and add dependencies - Add DeltaLakeCallback class with support for various data types - Implement partitioning, Z-ordering, and time travel features - Add schema documentation for each data type - Include Delta Lake dependencies in setup.py - Create demo file for Delta Lake usage with S3 configuration - Update extras_require in setup.py to include deltalake option --- cryptofeed/backends/deltalake.py | 328 +++++++++++++++++++++++++++++++ examples/demo_deltalake.py | 54 +++++ setup.py | 2 + 3 files changed, 384 insertions(+) create mode 100644 cryptofeed/backends/deltalake.py create mode 100644 examples/demo_deltalake.py diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py new file mode 100644 index 000000000..1fd08e555 --- /dev/null +++ b/cryptofeed/backends/deltalake.py @@ -0,0 +1,328 @@ +''' +Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. 
+''' +from typing import Optional, List, Dict, Any +import logging +import pandas as pd +from deltalake import DeltaTable, write_deltalake + +from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK + +LOG = logging.getLogger('feedhandler') + +class DeltaLakeCallback(BackendQueue): + def __init__(self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs: Any): + super().__init__() + self.key = key or self.default_key + self.base_path = base_path + self.delta_table_path = f"{self.base_path}/{self.key}" + self.custom_columns = custom_columns or {} + self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.optimize_interval = optimize_interval + self.z_order_cols = z_order_cols or self._default_z_order_cols() + self.time_travel = time_travel + self.storage_options = storage_options or {} + self.write_count = 0 + self.running = True + + if optimize_interval <= 0: + raise ValueError("optimize_interval must be a positive integer") + + if not isinstance(self.partition_cols, list): + raise TypeError("partition_cols must be a list of strings") + + if not isinstance(self.z_order_cols, list): + raise TypeError("z_order_cols must be a list of strings") + + def _default_z_order_cols(self) -> List[str]: + common_cols = ['exchange', 'symbol', 'timestamp'] + data_specific_cols = { + TRADES: ['price', 'amount'], + FUNDING: ['rate'], + TICKER: ['bid', 'ask'], + OPEN_INTEREST: ['open_interest'], + LIQUIDATIONS: ['quantity', 'price'], + BOOK: [], # Book data is typically queried by timestamp and symbol + CANDLES: ['open', 'close', 'high', 'low'], + ORDER_INFO: ['status', 'price', 'amount'], + TRANSACTIONS: ['type', 'amount'], + BALANCES: ['balance'], + FILLS: ['price', 'amount'] + } + return common_cols + data_specific_cols.get(self.key, []) + + async def writer(self): + while self.running: + async with self.read_queue() as updates: + if updates: + df = pd.DataFrame(updates) + df['date'] = pd.to_datetime(df['timestamp'], unit='s') + df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') + df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day + + # Reorder columns to put exchange and symbol first + cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + df = df[cols] + + if self.custom_columns: + df = df.rename(columns=self.custom_columns) + + await self._write_batch(df) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + return + + try: + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + except Exception as e: + LOG.error(f"Error writing to Delta Lake: {e}") + + async def _optimize_table(self): + LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + dt = 
DeltaTable(self.delta_table_path, storage_options=self.storage_options) + dt.optimize.compact() + if self.z_order_cols: + dt.optimize.z_order(self.z_order_cols) + + def _update_metadata(self): + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") + + async def stop(self): + self.running = False + + def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: + if self.time_travel: + dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) + if timestamp: + return dt.version_at_timestamp(timestamp) + else: + return dt.version() + else: + LOG.warning("Time travel is not enabled for this table") + return None + +class TradeDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRADES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string (nullable) + - side: string + - amount: float64 + - price: float64 + - type: string (nullable) + """ + +class FundingDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FUNDING + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - mark_price: float64 (nullable) + - rate: float64 + - next_funding_time: datetime64[ns] (nullable) + - predicted_rate: float64 (nullable) + """ + +class TickerDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TICKER + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - bid: float64 + - ask: float64 + """ + +class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = OPEN_INTEREST + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - open_interest: float64 + """ + +class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = LIQUIDATIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - side: string + - quantity: float64 + - price: float64 + - id: string + - status: string + """ + +class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): + default_key = BOOK + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - delta: dict (nullable, contains 'bid' and 'ask' updates) + - book: dict (contains full order book snapshot when available) + """ + +class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = CANDLES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - start: datetime64[ns] + - stop: datetime64[ns] + - interval: string + - trades: int64 (nullable) + - open: float64 + - close: float64 + - high: float64 + - low: float64 + - volume: float64 + - closed: bool (nullable) + """ + +class 
OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = ORDER_INFO + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - id: string + - client_order_id: string (nullable) + - side: string + - status: string + - type: string + - price: float64 + - amount: float64 + - remaining: float64 (nullable) + - account: string (nullable) + """ + +class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = TRANSACTIONS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - type: string + - status: string + - amount: float64 + """ + +class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = BALANCES + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - currency: string + - balance: float64 + - reserved: float64 (nullable) + """ + +class FillsDeltaLake(DeltaLakeCallback, BackendCallback): + default_key = FILLS + """ + Schema: + - timestamp: datetime64[ns] (from 'date' column) + - receipt_timestamp: datetime64[ns] + - year: int32 + - month: int32 + - day: int32 + - exchange: string + - symbol: string + - price: float64 + - amount: float64 + - side: string + - fee: float64 (nullable) + - id: string + - order_id: string + - liquidity: string + - type: string + - account: string (nullable) + """ \ No newline at end of file diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py new file mode 100644 index 000000000..5c973016e --- /dev/null +++ b/examples/demo_deltalake.py @@ -0,0 +1,54 @@ +''' +Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com + +Please see the LICENSE file for the terms and conditions +associated with this software. 
+''' +from cryptofeed import FeedHandler +from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake +from cryptofeed.defines import FUNDING, TICKER, TRADES +from cryptofeed.exchanges import Binance + + +def main(): + f = FeedHandler() + + # Define the Delta Lake base path (can be local or S3) + delta_base_path = 's3://your-bucket/path/to/delta/tables' + + # S3 storage options (remove if using local storage) + s3_options = { + "AWS_ACCESS_KEY_ID": "your_access_key", + "AWS_SECRET_ACCESS_KEY": "your_secret_key", + "AWS_REGION": "your_region" + } + + # Add Binance feed with Delta Lake callbacks + f.add_feed(Binance( + channels=[TRADES, FUNDING, TICKER], + symbols=['BTC-USDT', 'ETH-USDT'], + callbacks={ + TRADES: TradeDeltaLake( + base_path=delta_base_path, + optimize_interval=50, # More frequent table optimization + time_travel=True, # Enable time travel feature + storage_options=s3_options # Add S3 configuration + ), + FUNDING: FundingDeltaLake( + base_path=delta_base_path, + storage_options=s3_options # Add S3 configuration + ), + TICKER: TickerDeltaLake( + base_path=delta_base_path, + partition_cols=['exchange', 'symbol', 'year', 'month', 'day'], # Custom partitioning + z_order_cols=['timestamp', 'bid', 'ask'], # Enable Z-ordering + storage_options=s3_options # Add S3 configuration + ) + } + )) + + f.run() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/setup.py b/setup.py index adf10f870..344573348 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def run_tests(self): "rabbit": ["aio_pika", "pika"], "redis": ["hiredis", "redis>=4.5.1"], "zmq": ["pyzmq"], + "deltalake": ["deltalake>=0.6.1", "pandas"], "all": [ "arctic", "google_cloud_pubsub>=2.4.1", @@ -107,6 +108,7 @@ def run_tests(self): "hiredis", "redis>=4.5.1", "pyzmq", + "deltalake>=0.6.1", ], }, ) From b7a20b93dadf45356b0c489f7290d417a24643ae Mon Sep 17 00:00:00 2001 From: Tommy K Date: Sun, 1 Sep 2024 00:11:55 +0200 Subject: [PATCH 38/87] feat(deltalake): optimize Delta Lake implementation --- cryptofeed/backends/deltalake.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 1fd08e555..bd29037ae 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,7 +5,9 @@ associated with this software. 
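With the new extras_require entry, the backend's dependencies can be installed with pip install cryptofeed[deltalake]. A minimal sketch of reading back what the demo above writes, assuming the deltalake package's DeltaTable API; the path, credentials, partition values and version number are illustrative:

    from deltalake import DeltaTable

    table_path = "s3://your-bucket/path/to/delta/tables/trades"
    storage_options = {
        "AWS_ACCESS_KEY_ID": "your_access_key",
        "AWS_SECRET_ACCESS_KEY": "your_secret_key",
        "AWS_REGION": "your_region",
    }

    dt = DeltaTable(table_path, storage_options=storage_options)
    print(dt.version())  # latest table version

    # Load a single exchange/symbol partition into pandas
    df = dt.to_pandas(partitions=[("exchange", "=", "BINANCE"), ("symbol", "=", "BTC-USDT")])
    print(df.head())

    # Pin an earlier version of the table for time travel
    dt_v0 = DeltaTable(table_path, version=0, storage_options=storage_options)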
''' from typing import Optional, List, Dict, Any +from collections import defaultdict import logging + import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -228,6 +230,11 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): + self.snapshots_only = snapshots_only + self.snapshot_interval = snapshot_interval + self.snapshot_count = defaultdict(int) + super().__init__(*args, **kwargs) class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES From 6f81da62365c52d6a5bf6cd9402004843756844d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 00:51:23 +0200 Subject: [PATCH 39/87] fix(deltalake): fix book table name --- cryptofeed/backends/deltalake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index bd29037ae..c55475f2c 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -12,7 +12,7 @@ from deltalake import DeltaTable, write_deltalake from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS, BOOK +from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS LOG = logging.getLogger('feedhandler') @@ -217,7 +217,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): """ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - default_key = BOOK + default_key = "book" """ Schema: - timestamp: datetime64[ns] (from 'date' column) From d12441b27a4bb17e40a831952cfa91f90be8e57d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:01:48 +0200 Subject: [PATCH 40/87] fix(deltalake): Fix book name --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c55475f2c..c1b882c34 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -57,7 +57,7 @@ def _default_z_order_cols(self) -> List[str]: TICKER: ['bid', 'ask'], OPEN_INTEREST: ['open_interest'], LIQUIDATIONS: ['quantity', 'price'], - BOOK: [], # Book data is typically queried by timestamp and symbol + "book": [], # Book data is typically queried by timestamp and symbol CANDLES: ['open', 'close', 'high', 'low'], ORDER_INFO: ['status', 'price', 'amount'], TRANSACTIONS: ['type', 'amount'], From d6a2ae8edba04df49e3442d64c0b48cb311d63eb Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:21:33 +0200 Subject: [PATCH 41/87] fix(deltalake): Fix numeric type --- cryptofeed/backends/deltalake.py | 35 ++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c1b882c34..fda44563e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -4,9 +4,10 @@ Please see the LICENSE file for the terms and conditions associated with this software. 
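The snapshot options added to BookDeltaLake above follow the pattern of the other cryptofeed book backends: deltas are written by default, a full book snapshot is taken every snapshot_interval updates, and snapshots_only=True skips the deltas entirely. A minimal usage sketch; the exchange, symbol and path are illustrative:

    from cryptofeed import FeedHandler
    from cryptofeed.backends.deltalake import BookDeltaLake
    from cryptofeed.defines import L2_BOOK
    from cryptofeed.exchanges import Coinbase

    f = FeedHandler()
    f.add_feed(Coinbase(
        channels=[L2_BOOK],
        symbols=['BTC-USD'],
        callbacks={
            # deltas plus a periodic full snapshot every 1000 updates
            L2_BOOK: BookDeltaLake(base_path='./delta-tables', snapshot_interval=1000)
        }
    ))
    f.run()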
''' -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Union from collections import defaultdict import logging +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -16,16 +17,19 @@ LOG = logging.getLogger('feedhandler') + class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, + def __init__(self, + base_path: str, + key: Optional[str] = None, custom_columns: Optional[Dict[str, str]] = None, partition_cols: Optional[List[str]] = None, optimize_interval: int = 100, z_order_cols: Optional[List[str]] = None, time_travel: bool = False, storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, **kwargs: Any): super().__init__() self.key = key or self.default_key @@ -49,6 +53,9 @@ def __init__(self, if not isinstance(self.z_order_cols, list): raise TypeError("z_order_cols must be a list of strings") + self.numeric_type = numeric_type + self.none_to = none_to + def _default_z_order_cols(self) -> List[str]: common_cols = ['exchange', 'symbol', 'timestamp'] data_specific_cols = { @@ -89,6 +96,15 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Replace None values with the specified value + if self.none_to is not None: + df = df.fillna(self.none_to) + LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( self.delta_table_path, @@ -134,6 +150,7 @@ def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: LOG.warning("Time travel is not enabled for this table") return None + class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ @@ -152,6 +169,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): - type: string (nullable) """ + class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ @@ -169,6 +187,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): - predicted_rate: float64 (nullable) """ + class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ @@ -184,6 +203,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): - ask: float64 """ + class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ @@ -198,6 +218,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): - open_interest: float64 """ + class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ @@ -216,6 +237,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): - status: string """ + class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ @@ -236,6 +258,7 @@ def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs self.snapshot_count = defaultdict(int) super().__init__(*args, **kwargs) + class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ @@ -259,6 +282,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): - closed: bool (nullable) """ + class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ @@ -281,6 +305,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): - account: string (nullable) """ + class 
TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ @@ -297,6 +322,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): - amount: float64 """ + class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ @@ -312,6 +338,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): - reserved: float64 (nullable) """ + class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ From e257d83cb942b96385107c234733d9bac55b4832 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:26:24 +0200 Subject: [PATCH 42/87] fix: Ensure timestamp columns have nanosecond precision in DeltaLake backend --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 109 +++++++++++++++++++------------ 2 files changed, 68 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 5860625f7..ac64f2b9e 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ ENV/ # PyCharm .idea/ +.aider* diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fda44563e..963488b29 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -1,49 +1,60 @@ -''' +""" Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' -from typing import Optional, List, Dict, Any, Union -from collections import defaultdict +""" + import logging -import numpy as np +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendQueue, BackendBookCallback, BackendCallback -from cryptofeed.defines import BALANCES, CANDLES, FILLS, FUNDING, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, LIQUIDATIONS, TRANSACTIONS +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + -LOG = logging.getLogger('feedhandler') +LOG = logging.getLogger("feedhandler") class DeltaLakeCallback(BackendQueue): - def __init__(self, - base_path: str, - key: Optional[str] = None, - custom_columns: Optional[Dict[str, str]] = None, - partition_cols: Optional[List[str]] = None, - optimize_interval: int = 100, - z_order_cols: Optional[List[str]] = None, - time_travel: bool = False, - storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = float, - none_to: Any = None, - **kwargs: Any): + def __init__( + self, + base_path: str, + key: Optional[str] = None, + custom_columns: Optional[Dict[str, str]] = None, + partition_cols: Optional[List[str]] = None, + optimize_interval: int = 100, + z_order_cols: Optional[List[str]] = None, + time_travel: bool = False, + storage_options: Optional[Dict[str, Any]] = None, + numeric_type: Union[type, str] = float, + none_to: Any = None, + **kwargs: Any, + ): super().__init__() self.key = key or self.default_key self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or ['exchange', 'symbol', 'year', 'month', 'day'] + self.partition_cols = partition_cols or [ + "exchange", + "symbol", + "year", + "month", + "day", + ] self.optimize_interval = optimize_interval 
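    # Every `optimize_interval` successful writes, _write_batch() calls _optimize_table(),
    # which compacts the table and Z-orders it on z_order_cols.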
self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel self.storage_options = storage_options or {} self.write_count = 0 self.running = True - + if optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") @@ -57,19 +68,19 @@ def __init__(self, self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ['exchange', 'symbol', 'timestamp'] + common_cols = ["exchange", "symbol", "timestamp"] data_specific_cols = { - TRADES: ['price', 'amount'], - FUNDING: ['rate'], - TICKER: ['bid', 'ask'], - OPEN_INTEREST: ['open_interest'], - LIQUIDATIONS: ['quantity', 'price'], + TRADES: ["price", "amount"], + FUNDING: ["rate"], + TICKER: ["bid", "ask"], + OPEN_INTEREST: ["open_interest"], + LIQUIDATIONS: ["quantity", "price"], "book": [], # Book data is typically queried by timestamp and symbol - CANDLES: ['open', 'close', 'high', 'low'], - ORDER_INFO: ['status', 'price', 'amount'], - TRANSACTIONS: ['type', 'amount'], - BALANCES: ['balance'], - FILLS: ['price', 'amount'] + CANDLES: ["open", "close", "high", "low"], + ORDER_INFO: ["status", "price", "amount"], + TRANSACTIONS: ["type", "amount"], + BALANCES: ["balance"], + FILLS: ["price", "amount"], } return common_cols + data_specific_cols.get(self.key, []) @@ -78,17 +89,25 @@ async def writer(self): async with self.read_queue() as updates: if updates: df = pd.DataFrame(updates) - df['date'] = pd.to_datetime(df['timestamp'], unit='s') - df['receipt_timestamp'] = pd.to_datetime(df['receipt_timestamp'], unit='s') - df['year'], df['month'], df['day'] = df['date'].dt.year, df['date'].dt.month, df['date'].dt.day - + df["date"] = pd.to_datetime(df["timestamp"], unit="s") + df["receipt_timestamp"] = pd.to_datetime( + df["receipt_timestamp"], unit="s" + ) + df["year"], df["month"], df["day"] = ( + df["date"].dt.year, + df["date"].dt.month, + df["date"].dt.day, + ) + # Reorder columns to put exchange and symbol first - cols = ['exchange', 'symbol'] + [col for col in df.columns if col not in ['exchange', 'symbol']] + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] df = df[cols] - + if self.custom_columns: df = df.rename(columns=self.custom_columns) - + await self._write_batch(df) async def _write_batch(self, df: pd.DataFrame): @@ -96,6 +115,11 @@ async def _write_batch(self, df: pd.DataFrame): return try: + # Ensure timestamp columns are in nanosecond precision + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns for col in numeric_columns: @@ -112,7 +136,7 @@ async def _write_batch(self, df: pd.DataFrame): mode="append", partition_by=self.partition_cols, schema_mode="merge", - storage_options=self.storage_options + storage_options=self.storage_options, ) self.write_count += 1 @@ -252,6 +276,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ + def __init__(self, *args, snapshots_only=False, snapshot_interval=1000, **kwargs): self.snapshots_only = snapshots_only self.snapshot_interval = snapshot_interval @@ -359,4 +384,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: string - type: string - account: string (nullable) - """ \ No newline 
at end of file + """ From 265e7616f4b0d7a71f1a0987ef453ab23add0d78 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 17:47:21 +0200 Subject: [PATCH 43/87] feat: Refactor timestamp column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 963488b29..fa6050edb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -115,10 +129,10 @@ async def _write_batch(self, df: pd.DataFrame): return try: - # Ensure timestamp columns are in nanosecond precision + # Convert timestamp columns from ns to us timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[us]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From 00d81ec358ca94d76a741cbd6f484767bb6cc83d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Sun, 1 Sep 2024 18:25:31 +0200 Subject: [PATCH 44/87] fix: Handle null values in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fa6050edb..dca2dd07d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -82,7 +82,7 @@ def __init__( self.none_to = none_to def _default_z_order_cols(self) -> List[str]: - common_cols = ["exchange", "symbol", "timestamp"] + common_cols = ["timestamp"] data_specific_cols = { TRADES: ["price", "amount"], FUNDING: ["rate"], @@ -96,7 +96,9 @@ def _default_z_order_cols(self) -> List[str]: BALANCES: ["balance"], FILLS: ["price", "amount"], } - return common_cols + data_specific_cols.get(self.key, []) + z_order_cols = common_cols + data_specific_cols.get(self.key, []) + # Remove any columns that are already in partition_cols + return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): while self.running: @@ -139,9 +141,20 @@ async def _write_batch(self, df: pd.DataFrame): for col in numeric_columns: df[col] = df[col].astype(self.numeric_type) - # Replace None values with the specified value + # Handle null values if self.none_to is not None: df = df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == 'object': + df[col] = df[col].fillna('') # Replace None with empty string for object columns + elif df[col].dtype in ['float64', 'int64']: + df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns + elif df[col].dtype == 
'bool': + df[col] = df[col].fillna(False) # Replace None with False for boolean columns + elif df[col].dtype == 'datetime64[us]': + df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( From d6ed8b90ca0d9fed66071690a89ff287318e02f7 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:51:21 +0200 Subject: [PATCH 45/87] feat: Implement DeltaLake backend for Cryptofeed --- cryptofeed/backends/deltalake.py | 357 +++++++++++++++++-------------- 1 file changed, 200 insertions(+), 157 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index dca2dd07d..d75277e1b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -5,6 +5,7 @@ associated with this software. """ +import asyncio import logging from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -13,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -55,13 +42,7 @@ def __init__( self.base_path = base_path self.delta_table_path = f"{self.base_path}/{self.key}" self.custom_columns = custom_columns or {} - self.partition_cols = partition_cols or [ - "exchange", - "symbol", - "year", - "month", - "day", - ] + self.partition_cols = partition_cols or ["exchange", "symbol", "dt"] self.optimize_interval = optimize_interval self.z_order_cols = z_order_cols or self._default_z_order_cols() self.time_travel = time_travel @@ -69,17 +50,31 @@ def __init__( self.write_count = 0 self.running = True - if optimize_interval <= 0: + # Validate configuration parameters + self._validate_configuration() + + self.numeric_type = numeric_type + self.none_to = none_to + + def _validate_configuration(self): + if self.optimize_interval <= 0: raise ValueError("optimize_interval must be a positive integer") - if not isinstance(self.partition_cols, list): + if not isinstance(self.partition_cols, list) or not all( + isinstance(col, str) for col in self.partition_cols + ): raise TypeError("partition_cols must be a list of strings") - if not isinstance(self.z_order_cols, list): + if not isinstance(self.z_order_cols, list) or not all( + isinstance(col, str) for col in self.z_order_cols + ): raise TypeError("z_order_cols must be a list of strings") - self.numeric_type = numeric_type - self.none_to = none_to + if not isinstance(self.storage_options, dict): + raise TypeError("storage_options must be a dictionary") + + if not isinstance(self.numeric_type, (type, str)): + raise TypeError("numeric_type must be a type or a string") def _default_z_order_cols(self) -> List[str]: common_cols = ["timestamp"] @@ -104,17 +99,9 @@ async def writer(self): while self.running: async with self.read_queue() as updates: if updates: + LOG.info(f"Received {len(updates)} updates for processing.") df = pd.DataFrame(updates) - df["date"] = 
pd.to_datetime(df["timestamp"], unit="s") - df["receipt_timestamp"] = pd.to_datetime( - df["receipt_timestamp"], unit="s" - ) - df["year"], df["month"], df["day"] = ( - df["date"].dt.year, - df["date"].dt.month, - df["date"].dt.day, - ) - + self._convert_fields(df) # Reorder columns to put exchange and symbol first cols = ["exchange", "symbol"] + [ col for col in df.columns if col not in ["exchange", "symbol"] @@ -126,55 +113,126 @@ async def writer(self): await self._write_batch(df) + def _convert_fields(self, df: pd.DataFrame): + LOG.debug("Converting fields in DataFrame.") + self._convert_datetime_fields(df) + self._convert_category_fields(df) + self._convert_int_fields(df) + + def _convert_datetime_fields(self, df: pd.DataFrame): + LOG.debug("Converting datetime fields.") + datetime_columns = ["timestamp", "receipt_timestamp"] + for col in datetime_columns: + if col in df.columns: + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + if "timestamp" in df.columns: + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + + def _convert_category_fields(self, df: pd.DataFrame): + LOG.debug("Converting category fields.") + category_columns = [ + "exchange", + "symbol", + "side", + "type", + "status", + "currency", + "liquidity", + ] + for col in category_columns: + if col in df.columns: + df[col] = df[col].astype("category") + + def _convert_int_fields(self, df: pd.DataFrame): + LOG.debug("Converting integer fields.") + int_columns = ["id", "trade_id", "trades"] + for col in int_columns: + if col in df.columns: + df[col] = df[col].astype("int64") + async def _write_batch(self, df: pd.DataFrame): if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") return - try: - # Convert timestamp columns from ns to us - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[us]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) + max_retries = 3 + retry_delay = 5 # seconds + + for attempt in range(max_retries): + try: + LOG.info( + f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." 
+ ) + # Convert timestamp columns to datetime64[ns] + timestamp_columns = df.select_dtypes(include=["datetime64"]).columns + for col in timestamp_columns: + df[col] = df[col].astype("datetime64[ns]") + + # Convert numeric columns to the specified numeric type + numeric_columns = df.select_dtypes(include=[np.number]).columns + for col in numeric_columns: + df[col] = df[col].astype(self.numeric_type) + + # Handle null values + df = self._handle_null_values(df) + + LOG.info( + f"Writing batch of {len(df)} records to {self.delta_table_path}" + ) + write_deltalake( + self.delta_table_path, + df, + mode="append", + partition_by=self.partition_cols, + schema_mode="merge", + storage_options=self.storage_options, + ) + self.write_count += 1 + + if self.write_count % self.optimize_interval == 0: + await self._optimize_table() + + if self.time_travel: + self._update_metadata() + + LOG.info("Batch write successful.") + break # Exit the retry loop if write is successful + + except Exception as e: + LOG.error( + f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" + ) + if attempt < max_retries - 1: + LOG.info(f"Retrying in {retry_delay} seconds...") + await asyncio.sleep(retry_delay) + else: + LOG.error( + "Max retries reached. Failed to write batch to Delta Lake." + ) - # Handle null values - if self.none_to is not None: - df = df.fillna(self.none_to) - else: - # Replace None with appropriate default values based on column type - for col in df.columns: - if df[col].dtype == 'object': - df[col] = df[col].fillna('') # Replace None with empty string for object columns - elif df[col].dtype in ['float64', 'int64']: - df[col] = df[col].fillna(0) # Replace None with 0 for numeric columns - elif df[col].dtype == 'bool': - df[col] = df[col].fillna(False) # Replace None with False for boolean columns - elif df[col].dtype == 'datetime64[us]': - df[col] = df[col].fillna(pd.Timestamp.min) # Replace None with minimum timestamp for datetime columns - - LOG.info(f"Writing batch of {len(df)} records to {self.delta_table_path}") - write_deltalake( - self.delta_table_path, - df, - mode="append", - partition_by=self.partition_cols, - schema_mode="merge", - storage_options=self.storage_options, - ) - self.write_count += 1 - - if self.write_count % self.optimize_interval == 0: - await self._optimize_table() - - if self.time_travel: - self._update_metadata() - - except Exception as e: - LOG.error(f"Error writing to Delta Lake: {e}") + def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: + if self.none_to is not None: + return df.fillna(self.none_to) + else: + # Replace None with appropriate default values based on column type + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].fillna( + "" + ) # Replace None with empty string for object columns + elif df[col].dtype in ["float64", "int64"]: + df[col] = df[col].fillna( + 0 + ) # Replace None with 0 for numeric columns + elif df[col].dtype == "bool": + df[col] = df[col].fillna( + False + ) # Replace None with False for boolean columns + elif df[col].dtype == "datetime64[ns]": + df[col] = df[col].fillna( + pd.Timestamp.min + ) # Replace None with minimum timestamp for datetime columns + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") @@ -182,21 +240,27 @@ async def _optimize_table(self): dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) + LOG.info("OPTIMIZE operation completed.") def _update_metadata(self): dt = 
DeltaTable(self.delta_table_path, storage_options=self.storage_options) LOG.info(f"Updating metadata for time travel. Current version: {dt.version()}") async def stop(self): + LOG.info("Stopping DeltaLakeCallback writer.") self.running = False def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) if timestamp: - return dt.version_at_timestamp(timestamp) + version = dt.version_at_timestamp(timestamp) + LOG.info(f"Retrieved version {version} for timestamp {timestamp}.") + return version else: - return dt.version() + version = dt.version() + LOG.info(f"Retrieved current version {version}.") + return version else: LOG.warning("Time travel is not enabled for this table") return None @@ -208,16 +272,15 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string (nullable) - - side: string + - dt: string + - exchange: category + - symbol: category + - id: int64 (nullable) + - side: category - amount: float64 - price: float64 - - type: string (nullable) + - type: category (nullable) + - trade_id: int64 """ @@ -227,11 +290,9 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - mark_price: float64 (nullable) - rate: float64 - next_funding_time: datetime64[ns] (nullable) @@ -245,11 +306,9 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - bid: float64 - ask: float64 """ @@ -261,11 +320,9 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - open_interest: float64 """ @@ -276,16 +333,14 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - side: string + - dt: string + - exchange: category + - symbol: category + - side: category - quantity: float64 - price: float64 - - id: string - - status: string + - id: int64 + - status: category """ @@ -295,11 +350,9 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) - book: dict (contains full order book snapshot when available) """ @@ -317,11 +370,9 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - 
year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - start: datetime64[ns] - stop: datetime64[ns] - interval: string @@ -341,16 +392,14 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string - - id: string + - dt: string + - exchange: category + - symbol: category + - id: int64 - client_order_id: string (nullable) - - side: string - - status: string - - type: string + - side: category + - status: category + - type: category - price: float64 - amount: float64 - remaining: float64 (nullable) @@ -364,13 +413,11 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string - - type: string - - status: string + - dt: string + - exchange: category + - currency: category + - type: category + - status: category - amount: float64 """ @@ -381,11 +428,9 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - currency: string + - dt: string + - exchange: category + - currency: category - balance: float64 - reserved: float64 (nullable) """ @@ -397,18 +442,16 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[ns] (from 'date' column) - receipt_timestamp: datetime64[ns] - - year: int32 - - month: int32 - - day: int32 - - exchange: string - - symbol: string + - dt: string + - exchange: category + - symbol: category - price: float64 - amount: float64 - - side: string + - side: category - fee: float64 (nullable) - - id: string - - order_id: string - - liquidity: string - - type: string + - id: int64 + - order_id: int64 + - liquidity: category + - type: category - account: string (nullable) """ From 16f282588b47c20a64b3981cb97a1eb670f5bb9f Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 12:57:16 +0200 Subject: [PATCH 46/87] fix: Refactor DeltaLakeCallback class --- cryptofeed/backends/deltalake.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d75277e1b..6e158f364 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -49,12 +63,10 @@ def __init__( self.storage_options = storage_options or {} self.write_count = 0 self.running = True - - # Validate configuration parameters - self._validate_configuration() - 
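    # numeric_type and none_to are assigned before _validate_configuration() is called,
    # because the validation checks self.numeric_type; calling it before these assignments
    # (as in the previous patch) would raise AttributeError.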
self.numeric_type = numeric_type self.none_to = none_to + # Validate configuration parameters + self._validate_configuration() def _validate_configuration(self): if self.optimize_interval <= 0: From d51babc3ed1975075222509b27a7f9e7c1ea350e Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:13:37 +0200 Subject: [PATCH 47/87] fix: Add debug logging for DataFrame schema in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 6e158f364..7991f84cb 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -175,6 +175,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema:\n{df.dtypes}") + # Convert timestamp columns to datetime64[ns] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: @@ -191,6 +194,9 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) + # Debug output the schema of the DataFrame + LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") + write_deltalake( self.delta_table_path, df, From 2a7712ec6002b58e4d3e6028b9a08f09895d8e29 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:20:07 +0200 Subject: [PATCH 48/87] fix: Add DataFrame schema logging when timestamp-related error occurs during Delta Lake write --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 7991f84cb..ffb47db7d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -217,9 +217,13 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: + # When error is related to timestamp, print the schema of the DataFrame + if "timestamp" in str(e): + LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) + if attempt < max_retries - 1: LOG.info(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) From f611215ba6b8dfa8ef1ef8d09fddf3ebde6430b5 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:36:58 +0200 Subject: [PATCH 49/87] fix: convert timestamp columns to datetime64[ms] --- cryptofeed/backends/deltalake.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ffb47db7d..57084ec1a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -136,7 +136,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ns]") + df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") @@ -178,10 +178,10 @@ async def _write_batch(self, df: pd.DataFrame): # Debug output the schema of the DataFrame LOG.debug(f"DataFrame 
schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ns] + # Convert timestamp columns to datetime64[ms] timestamp_columns = df.select_dtypes(include=["datetime64"]).columns for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ns]") + df[col] = df[col].astype("datetime64[ms]") # Convert numeric columns to the specified numeric type numeric_columns = df.select_dtypes(include=[np.number]).columns From a16f737c873c257cc0dac8e7b4ed25dfe348828d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 2 Sep 2024 16:59:19 +0200 Subject: [PATCH 50/87] fix: Ensure all partition columns are present in the DataFrame --- cryptofeed/backends/deltalake.py | 36 +++++++++++++++----------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 57084ec1a..32e9a902a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -167,6 +153,18 @@ async def _write_batch(self, df: pd.DataFrame): LOG.warning("DataFrame is empty. Skipping write operation.") return + # Ensure all partition columns are present in the DataFrame + for col in self.partition_cols: + if col not in df.columns: + if col == "exchange" or col == "symbol": + df[col] = "" # Default to empty string for categorical columns + elif col == "dt": + df[col] = pd.Timestamp.min.strftime( + "%Y-%m-%d" + ) # Default to min date for date columns + else: + df[col] = 0 # Default to 0 for numeric columns + max_retries = 3 retry_delay = 5 # seconds @@ -218,8 +216,8 @@ async def _write_batch(self, df: pd.DataFrame): except Exception as e: # When error is related to timestamp, print the schema of the DataFrame - if "timestamp" in str(e): - LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) From 4173cc2dd03e8b63f01ea5ef1d7a03fb3edfb036 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:00:25 +0200 Subject: [PATCH 51/87] fix: convert timestamp column to datetime64[ms] format --- cryptofeed/backends/deltalake.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 32e9a902a..cb9f98328 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + 
BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -124,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = df["timestamp"].dt.date.astype("string") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") @@ -248,9 +262,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: df[col] = df[col].fillna( False ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ns]": + elif df[col].dtype == "datetime64[ms]": df[col] = df[col].fillna( - pd.Timestamp.min + pd.Timestamp.min.astype("datetime64[ms]") ) # Replace None with minimum timestamp for datetime columns return df From b880bfa7c3f6a5cec83bd0e9dfe8f9b921b18bdb Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:11:45 +0200 Subject: [PATCH 52/87] feat: Convert timestamp column to date string format in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index cb9f98328..d05ebb06d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -138,7 +138,7 @@ def _convert_datetime_fields(self, df: pd.DataFrame): if col in df.columns: df[col] = pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.date.astype("string") + df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") def _convert_category_fields(self, df: pd.DataFrame): LOG.debug("Converting category fields.") From b02e46ccb7aeb63056578fd5c8eb80f574476811 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:32:15 +0200 Subject: [PATCH 53/87] refactor: Simplify null value handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d05ebb06d..e38a755c6 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -248,25 +248,18 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: if self.none_to is not None: return df.fillna(self.none_to) else: - # Replace None with appropriate default values based on column type for col in df.columns: - if df[col].dtype == "object": - df[col] = df[col].fillna( - "" - ) # Replace None with empty string for object columns - elif df[col].dtype in ["float64", "int64"]: - df[col] = df[col].fillna( - 0 - ) # Replace None with 0 for numeric columns - elif df[col].dtype == "bool": - df[col] = df[col].fillna( - False - ) # Replace None with False for boolean columns - elif df[col].dtype == "datetime64[ms]": - df[col] = df[col].fillna( - pd.Timestamp.min.astype("datetime64[ms]") - ) # Replace None with minimum timestamp for datetime columns - return df + if pd.api.types.is_string_dtype( + df[col] + ) or pd.api.types.is_categorical_dtype(df[col]): + df[col] = df[col].fillna("") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna(0) + elif 
pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna(False) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + return df async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") From 358c162478de9cd828797a90fcc640087adfd7b0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:42:41 +0200 Subject: [PATCH 54/87] fix: Ensure empty string is a category in categorical columns and handle null values correctly --- cryptofeed/backends/deltalake.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e38a755c6..c15c85c10 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -153,7 +153,11 @@ def _convert_category_fields(self, df: pd.DataFrame): ] for col in category_columns: if col in df.columns: - df[col] = df[col].astype("category") + # Add empty string as a category if it's not already present + categories = df[col].unique().tolist() + if '' not in categories: + categories.append('') + df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): LOG.debug("Converting integer fields.") @@ -249,9 +253,12 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_string_dtype( - df[col] - ) or pd.api.types.is_categorical_dtype(df[col]): + if pd.api.types.is_categorical_dtype(df[col]): + # Ensure '' is in the categories before filling + if '' not in df[col].cat.categories: + df[col] = df[col].cat.add_categories(['']) + df[col] = df[col].fillna('') + elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) From 90a1559bdc7cf3849e266f25c3d045156eba5207 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:54:28 +0200 Subject: [PATCH 55/87] refactor: Refactor DeltaLakeCallback class to improve code readability and maintainability --- cryptofeed/backends/deltalake.py | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c15c85c10..b9ac6c296 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -128,7 +114,7 @@ async def writer(self): def _convert_fields(self, df: pd.DataFrame): LOG.debug("Converting fields in DataFrame.") self._convert_datetime_fields(df) - self._convert_category_fields(df) + # self._convert_category_fields(df) self._convert_int_fields(df) def _convert_datetime_fields(self, df: pd.DataFrame): @@ -155,8 +141,8 @@ def 
_convert_category_fields(self, df: pd.DataFrame): if col in df.columns: # Add empty string as a category if it's not already present categories = df[col].unique().tolist() - if '' not in categories: - categories.append('') + if "" not in categories: + categories.append("") df[col] = pd.Categorical(df[col], categories=categories) def _convert_int_fields(self, df: pd.DataFrame): @@ -255,9 +241,9 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: for col in df.columns: if pd.api.types.is_categorical_dtype(df[col]): # Ensure '' is in the categories before filling - if '' not in df[col].cat.categories: - df[col] = df[col].cat.add_categories(['']) - df[col] = df[col].fillna('') + if "" not in df[col].cat.categories: + df[col] = df[col].cat.add_categories([""]) + df[col] = df[col].fillna("") elif pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): From 1365432523c91dab6aa2b3bb3c58fd638929a55d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:00:09 +0200 Subject: [PATCH 56/87] fix: Improve error handling and logging in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 34 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index b9ac6c296..0ed17520b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -219,8 +233,9 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame + # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") + LOG.error(f"DataFrame:\n{df}") LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" @@ -239,19 +254,16 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: return df.fillna(self.none_to) else: for col in df.columns: - if pd.api.types.is_categorical_dtype(df[col]): - # Ensure '' is in the categories before filling - if "" not in df[col].cat.categories: - df[col] = df[col].cat.add_categories([""]) - df[col] = df[col].fillna("") - elif pd.api.types.is_string_dtype(df[col]): + if pd.api.types.is_string_dtype(df[col]): df[col] = df[col].fillna("") elif pd.api.types.is_numeric_dtype(df[col]): df[col] = df[col].fillna(0) elif pd.api.types.is_bool_dtype(df[col]): df[col] = df[col].fillna(False) elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min).astype("datetime64[ms]") + df[col] = df[col].fillna(pd.Timestamp.min) + else: + df[col] = df[col].fillna(None) return df async def _optimize_table(self): From ac5e61a553d56be1468446de0b1894c876ab87fb Mon Sep 17 00:00:00 2001 From: Tommy 
K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:07:09 +0200 Subject: [PATCH 57/87] fix: Optimize Delta Lake table by filling null values with empty strings --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ed17520b..ccc8232b5 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,23 +14,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) LOG = logging.getLogger("feedhandler") @@ -263,7 +249,7 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna(None) + df[col] = df[col].fillna("") return df async def _optimize_table(self): From 3338d3cb19c16e7bb7c95bffe0362a04fd894036 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:44:08 +0200 Subject: [PATCH 58/87] fix: optimize handling of missing data in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ccc8232b5..16a25e2b1 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -14,9 +14,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) LOG = logging.getLogger("feedhandler") @@ -249,7 +263,8 @@ def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: elif pd.api.types.is_datetime64_any_dtype(df[col]): df[col] = df[col].fillna(pd.Timestamp.min) else: - df[col] = df[col].fillna("") + # For any other data types, use an empty string as a fallback + df[col] = df[col].astype(object).fillna("") return df async def _optimize_table(self): From 7c93b11688095577ca7fcbc583abe5799a15b406 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 03:55:54 +0200 Subject: [PATCH 59/87] feat: Add custom transformations and improve column validation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 217 +++++++++++++++++++------------ 1 file changed, 131 insertions(+), 86 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 16a25e2b1..e135b87c0 100644 --- a/cryptofeed/backends/deltalake.py +++ 
b/cryptofeed/backends/deltalake.py @@ -49,6 +49,7 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): super().__init__() @@ -65,6 +66,16 @@ def __init__( self.running = True self.numeric_type = numeric_type self.none_to = none_to + self.transformations = [ + self._rename_custom_columns, + self._convert_datetime_columns, + self._convert_int_columns, + self._ensure_partition_columns, + self._handle_missing_values, + self._reorder_columns, + ] + if custom_transformations: + self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() @@ -112,76 +123,146 @@ async def writer(self): async with self.read_queue() as updates: if updates: LOG.info(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) - self._convert_fields(df) - # Reorder columns to put exchange and symbol first - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", "symbol"] - ] - df = df[cols] - if self.custom_columns: - df = df.rename(columns=self.custom_columns) + self._transform_columns(df) + self._validate_columns(df) await self._write_batch(df) - def _convert_fields(self, df: pd.DataFrame): - LOG.debug("Converting fields in DataFrame.") - self._convert_datetime_fields(df) - # self._convert_category_fields(df) - self._convert_int_fields(df) + def _validate_columns(self, df: pd.DataFrame): + LOG.debug("Validating DataFrame columns.") + # Check for required columns + required_columns = ["exchange", "symbol", "dt"] + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise ValueError(f"Missing required columns: {', '.join(missing_columns)}") + + # Validate partition columns + for col in self.partition_cols: + if col not in df.columns: + raise ValueError(f"Partition column '{col}' not found in DataFrame") + if df[col].isnull().any(): + raise ValueError(f"Partition column '{col}' contains null values") + + # Validate data types + expected_types = { + "exchange": "object", + "symbol": "object", + "dt": "object", + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", + } + for col, expected_type in expected_types.items(): + if col in df.columns and not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) + + LOG.debug("DataFrame columns validation completed successfully.") + + def _transform_columns(self, df: pd.DataFrame): + LOG.debug("Transforming columns in DataFrame.") + for transformation in self.transformations: + transformation(df) + + def _rename_custom_columns(self, df: pd.DataFrame): + if self.custom_columns: + LOG.debug("Renaming columns based on custom_columns configuration.") + df.rename(columns=self.custom_columns, inplace=True) + + def _reorder_columns(self, df: pd.DataFrame): + LOG.debug("Reordering columns to prioritize exchange and symbol.") + cols = ["exchange", "symbol"] + [ + col for col in df.columns if col not in ["exchange", "symbol"] + ] + df.reindex(columns=cols, inplace=True) - def _convert_datetime_fields(self, df: pd.DataFrame): - LOG.debug("Converting datetime fields.") + def _convert_datetime_columns(self, df: pd.DataFrame): + LOG.debug("Converting datetime columns.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = 
pd.to_datetime(df[col], unit="ns").astype("datetime64[ms]") + df[col] = pd.to_datetime(df[col], unit="ms") + + # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + elif "receipt_timestamp" in df.columns: + df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + else: + LOG.warning("No timestamp column found. Using current date for 'dt'.") + df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") - def _convert_category_fields(self, df: pd.DataFrame): - LOG.debug("Converting category fields.") - category_columns = [ - "exchange", - "symbol", - "side", - "type", - "status", - "currency", - "liquidity", - ] - for col in category_columns: - if col in df.columns: - # Add empty string as a category if it's not already present - categories = df[col].unique().tolist() - if "" not in categories: - categories.append("") - df[col] = pd.Categorical(df[col], categories=categories) - - def _convert_int_fields(self, df: pd.DataFrame): - LOG.debug("Converting integer fields.") + def _convert_int_columns(self, df: pd.DataFrame): + LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] for col in int_columns: if col in df.columns: - df[col] = df[col].astype("int64") + df[col] = pd.to_numeric(df[col], errors="coerce").astype( + "Int64" + ) # Use nullable integer type - async def _write_batch(self, df: pd.DataFrame): - if df.empty: - LOG.warning("DataFrame is empty. Skipping write operation.") - return - - # Ensure all partition columns are present in the DataFrame + def _ensure_partition_columns(self, df: pd.DataFrame): + LOG.debug("Ensuring all partition columns are present and not null.") for col in self.partition_cols: if col not in df.columns: - if col == "exchange" or col == "symbol": - df[col] = "" # Default to empty string for categorical columns + if col in ["exchange", "symbol"]: + df[col] = "unknown" elif col == "dt": - df[col] = pd.Timestamp.min.strftime( - "%Y-%m-%d" - ) # Default to min date for date columns + # 'dt' should already be created in _convert_datetime_columns + LOG.warning("'dt' column not found. This should not happen.") + df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") else: - df[col] = 0 # Default to 0 for numeric columns + df[col] = "unknown" + + # Fill any remaining null values + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." + ) + df[col] = df[col].fillna( + "unknown" + if col != "dt" + else pd.Timestamp.now().strftime("%Y-%m-%d") + ) + + def _handle_missing_values(self, df: pd.DataFrame): + LOG.debug("Handling missing values.") + for col in df.columns: + if col in ["exchange", "symbol"]: # Removed 'dt' from this list + # These are partition columns and should never be null + if df[col].isnull().any(): + LOG.warning( + f"Found null values in partition column {col}. Filling with default values." 
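                    # (editor's note, not part of the patch: these defaults exist because the
                    # _validate_columns check run on every batch raises ValueError when a
                    # partition column contains nulls, so every partition column must be
                    # filled before the batch reaches write_deltalake.)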
+ ) + df[col] = df[col].fillna("unknown") + elif pd.api.types.is_numeric_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else 0 + ) + elif pd.api.types.is_string_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + elif pd.api.types.is_bool_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else False + ) + elif pd.api.types.is_datetime64_any_dtype(df[col]): + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else pd.NaT + ) + else: + df[col] = df[col].fillna( + self.none_to if self.none_to is not None else "" + ) + + async def _write_batch(self, df: pd.DataFrame): + if df.empty: + LOG.warning("DataFrame is empty. Skipping write operation.") + return max_retries = 3 retry_delay = 5 # seconds @@ -191,27 +272,11 @@ async def _write_batch(self, df: pd.DataFrame): LOG.info( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - # Debug output the schema of the DataFrame LOG.debug(f"DataFrame schema:\n{df.dtypes}") - # Convert timestamp columns to datetime64[ms] - timestamp_columns = df.select_dtypes(include=["datetime64"]).columns - for col in timestamp_columns: - df[col] = df[col].astype("datetime64[ms]") - - # Convert numeric columns to the specified numeric type - numeric_columns = df.select_dtypes(include=[np.number]).columns - for col in numeric_columns: - df[col] = df[col].astype(self.numeric_type) - - # Handle null values - df = self._handle_null_values(df) - LOG.info( f"Writing batch of {len(df)} records to {self.delta_table_path}" ) - # Debug output the schema of the DataFrame - LOG.debug(f"DataFrame schema before write:\n{df.dtypes}") write_deltalake( self.delta_table_path, @@ -233,10 +298,8 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: - # When error is related to timestamp, print the schema of the DataFrame and the df LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error(f"DataFrame:\n{df}") - LOG.error( f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" ) @@ -249,24 +312,6 @@ async def _write_batch(self, df: pd.DataFrame): "Max retries reached. Failed to write batch to Delta Lake." 
) - def _handle_null_values(self, df: pd.DataFrame) -> pd.DataFrame: - if self.none_to is not None: - return df.fillna(self.none_to) - else: - for col in df.columns: - if pd.api.types.is_string_dtype(df[col]): - df[col] = df[col].fillna("") - elif pd.api.types.is_numeric_dtype(df[col]): - df[col] = df[col].fillna(0) - elif pd.api.types.is_bool_dtype(df[col]): - df[col] = df[col].fillna(False) - elif pd.api.types.is_datetime64_any_dtype(df[col]): - df[col] = df[col].fillna(pd.Timestamp.min) - else: - # For any other data types, use an empty string as a fallback - df[col] = df[col].astype(object).fillna("") - return df - async def _optimize_table(self): LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From bdea1ff0716b15564dab43c445d681de1680a0c7 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:11:14 +0200 Subject: [PATCH 60/87] fix: Add logging configuration to deltalake backend --- cryptofeed/backends/deltalake.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index e135b87c0..94628ad1e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -10,28 +10,16 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union -import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +# Add these lines after the imports +logging.basicConfig(level=logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") From 11c7b222c0e703795314f0419b787bc5bf3f5b80 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:19:36 +0200 Subject: [PATCH 61/87] fix: Initialize DeltaLakeCallback and add logging for writer method and _write_batch --- cryptofeed/backends/deltalake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 94628ad1e..91c139d5b 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -40,6 +40,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): + LOG.debug("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -107,8 +108,10 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): + LOG.debug("Writer method called") while self.running: async with self.read_queue() as updates: + LOG.debug(f"Read queue returned {len(updates)} updates") if updates: LOG.info(f"Received {len(updates)} updates for processing.") @@ -248,6 +251,7 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): + LOG.debug(f"_write_batch called with 
DataFrame of shape {df.shape}") if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return From d0ccd929d68daf95d1f307d56e1a1a7c0bc0e912 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:23:20 +0200 Subject: [PATCH 62/87] fix: Change logging levels from DEBUG to WARNING in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 43 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 91c139d5b..68250f6c8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -17,9 +17,10 @@ from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) + # Add these lines after the imports -logging.basicConfig(level=logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) +# logging.basicConfig(level=logging.DEBUG) +# logging.getLogger().setLevel(logging.DEBUG) LOG = logging.getLogger("feedhandler") @@ -40,7 +41,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.debug("Initializing DeltaLakeCallback") + LOG.warning("Initializing DeltaLakeCallback") # Changed to warning super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -108,12 +109,16 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.debug("Writer method called") + LOG.warning("Writer method called") # Changed to warning while self.running: async with self.read_queue() as updates: - LOG.debug(f"Read queue returned {len(updates)} updates") + LOG.warning( + f"Read queue returned {len(updates)} updates" + ) # Changed to warning if updates: - LOG.info(f"Received {len(updates)} updates for processing.") + LOG.warning( + f"Received {len(updates)} updates for processing." + ) # Changed to warning df = pd.DataFrame(updates) @@ -251,7 +256,9 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): - LOG.debug(f"_write_batch called with DataFrame of shape {df.shape}") + LOG.warning( + f"_write_batch called with DataFrame of shape {df.shape}" + ) # Changed to warning if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -261,14 +268,14 @@ async def _write_batch(self, df: pd.DataFrame): for attempt in range(max_retries): try: - LOG.info( + LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) - LOG.debug(f"DataFrame schema:\n{df.dtypes}") + ) # Changed to warning + LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning - LOG.info( + LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) + ) # Changed to warning write_deltalake( self.delta_table_path, @@ -286,7 +293,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.info("Batch write successful.") + LOG.warning("Batch write successful.") # Changed to warning break # Exit the retry loop if write is successful except Exception as e: @@ -297,7 +304,9 @@ async def _write_batch(self, df: pd.DataFrame): ) if attempt < max_retries - 1: - LOG.info(f"Retrying in {retry_delay} seconds...") + LOG.warning( + f"Retrying in {retry_delay} seconds..." 
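                        # (editor's note, not part of the patch: retry_delay is a fixed 5 s and
                        # max_retries is 3, so a failing batch is retried twice before the final
                        # error is logged and the batch is abandoned.)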
+ ) # Changed to warning await asyncio.sleep(retry_delay) else: LOG.error( @@ -305,12 +314,14 @@ async def _write_batch(self, df: pd.DataFrame): ) async def _optimize_table(self): - LOG.info(f"Running OPTIMIZE on table {self.delta_table_path}") + LOG.warning( + f"Running OPTIMIZE on table {self.delta_table_path}" + ) # Changed to warning dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.info("OPTIMIZE operation completed.") + LOG.warning("OPTIMIZE operation completed.") # Changed to warning def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From a8d631017c85849b3820911ff82ae0e6420f649f Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:49:40 +0200 Subject: [PATCH 63/87] fix: Improve logging and error handling in DeltaLakeCallback writer method --- cryptofeed/backends/deltalake.py | 61 ++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 68250f6c8..81f6f34b8 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,9 +13,23 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue -from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, - OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) +from cryptofeed.backends.backend import ( + BackendBookCallback, + BackendCallback, + BackendQueue, +) +from cryptofeed.defines import ( + BALANCES, + CANDLES, + FILLS, + FUNDING, + LIQUIDATIONS, + OPEN_INTEREST, + ORDER_INFO, + TICKER, + TRADES, + TRANSACTIONS, +) # Add these lines after the imports @@ -109,23 +123,32 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.warning("Writer method called") # Changed to warning + LOG.warning("Writer method started") while self.running: - async with self.read_queue() as updates: - LOG.warning( - f"Read queue returned {len(updates)} updates" - ) # Changed to warning - if updates: - LOG.warning( - f"Received {len(updates)} updates for processing." 
- ) # Changed to warning - - df = pd.DataFrame(updates) - - self._transform_columns(df) - self._validate_columns(df) - - await self._write_batch(df) + try: + async with self.read_queue() as updates: + LOG.warning(f"Read queue returned {len(updates)} updates") + if updates: + LOG.warning(f"Received {len(updates)} updates for processing.") + df = pd.DataFrame(updates) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_fields(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + else: + LOG.warning("No updates received, continuing loop") + except Exception as e: + LOG.error(f"Error in writer method: {e}", exc_info=True) + LOG.warning("Writer method ended") def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") From 4946bb843e520e9d1d1c385f5ea3fbc6760ce956 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 04:59:06 +0200 Subject: [PATCH 64/87] fix: Refactor field transformation in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 81f6f34b8..0ce9b9116 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -13,23 +13,9 @@ import pandas as pd from deltalake import DeltaTable, write_deltalake -from cryptofeed.backends.backend import ( - BackendBookCallback, - BackendCallback, - BackendQueue, -) -from cryptofeed.defines import ( - BALANCES, - CANDLES, - FILLS, - FUNDING, - LIQUIDATIONS, - OPEN_INTEREST, - ORDER_INFO, - TICKER, - TRADES, - TRANSACTIONS, -) +from cryptofeed.backends.backend import BackendBookCallback, BackendCallback, BackendQueue +from cryptofeed.defines import (BALANCES, CANDLES, FILLS, FUNDING, LIQUIDATIONS, + OPEN_INTEREST, ORDER_INFO, TICKER, TRADES, TRANSACTIONS) # Add these lines after the imports @@ -134,7 +120,7 @@ async def writer(self): LOG.warning(f"Created DataFrame with shape: {df.shape}") LOG.warning("Starting field transformation") - self._transform_fields(df) + self._transform_columns(df) LOG.warning("Field transformation completed") LOG.warning("Validating columns") From b1cad2b37250a68b3a3b6887f6d011f577ecd24c Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:15:11 +0200 Subject: [PATCH 65/87] fix: Reorder columns to prioritize exchange and symbol --- cryptofeed/backends/deltalake.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0ce9b9116..314bae1bf 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -62,7 +62,6 @@ def __init__( self._convert_int_columns, self._ensure_partition_columns, self._handle_missing_values, - self._reorder_columns, ] if custom_transformations: self.transformations.extend(custom_transformations) @@ -179,10 +178,9 @@ def _rename_custom_columns(self, df: pd.DataFrame): def _reorder_columns(self, df: pd.DataFrame): LOG.debug("Reordering columns to prioritize exchange and symbol.") - cols = ["exchange", "symbol"] + [ - col for col in df.columns if col not in ["exchange", 
"symbol"] - ] - df.reindex(columns=cols, inplace=True) + priority_cols = ["exchange", "symbol"] + other_cols = [col for col in df.columns if col not in priority_cols] + df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns.") From 6cfd5f71f88e4ab7acfc76ccf15ea1b360af7afd Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:22:38 +0200 Subject: [PATCH 66/87] fix: Ensure datetime columns have millisecond precision in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 314bae1bf..ee7027884 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,18 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' } for col, expected_type in expected_types.items(): - if col in df.columns and not df[col].dtype == expected_type: - raise TypeError( - f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" - ) + if col in df.columns: + if expected_type.startswith("datetime64"): + # Convert to millisecond precision if it's a datetime column + df[col] = df[col].astype('datetime64[ms]') + if not df[col].dtype == expected_type: + raise TypeError( + f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" + ) LOG.debug("DataFrame columns validation completed successfully.") @@ -187,7 +191,8 @@ def _convert_datetime_columns(self, df: pd.DataFrame): datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - df[col] = pd.to_datetime(df[col], unit="ms") + # Convert to millisecond precision + df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' if "timestamp" in df.columns: From d213cf00fe832e1c58577fd28de09dc0c515c05f Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 3 Sep 2024 05:39:57 +0200 Subject: [PATCH 67/87] feat: Ensure datetime columns are in millisecond precision --- cryptofeed/backends/deltalake.py | 95 ++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 35 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index ee7027884..f3b8eff50 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def _validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' - "receipt_timestamp": "datetime64[ms]", # Keep as 'datetime64[ms]' + "timestamp": "datetime64[ms]", + "receipt_timestamp": "datetime64[ms]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type.startswith("datetime64"): - # Convert to millisecond precision if it's a datetime column - df[col] = df[col].astype('datetime64[ms]') + if expected_type == "datetime64[ms]": + # Ensure datetime columns are in millisecond precision + df[col] = df[col].astype("datetime64[ms]") if not df[col].dtype == expected_type: raise 
TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,22 +187,47 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns.") + LOG.debug("Converting datetime columns to millisecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: - # Convert to millisecond precision - df[col] = pd.to_datetime(df[col], unit='ms').astype('datetime64[ms]') + # Log sample of original values + LOG.warning( + f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" + ) + # Convert to millisecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Log sample of converted values in readable format + if len(df) > 0: + readable_time = ( + df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + ) + LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' + min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["timestamp"] + .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) elif "receipt_timestamp" in df.columns: - df["dt"] = df["receipt_timestamp"].dt.strftime("%Y-%m-%d") + df["dt"] = ( + df["receipt_timestamp"] + .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) + .dt.strftime("%Y-%m-%d") + ) else: LOG.warning("No timestamp column found. Using current date for 'dt'.") df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + # Log sample of 'dt' column + if "dt" in df.columns and len(df) > 0: + LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + + LOG.debug("Datetime columns converted to millisecond precision.") + def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] @@ -363,8 +388,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -381,14 +406,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ns] (nullable) + - next_funding_time: datetime64[ms] (nullable) - predicted_rate: float64 (nullable) """ @@ -397,8 +422,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -411,8 +436,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: 
datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -424,8 +449,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -441,8 +466,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -461,13 +486,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category - - start: datetime64[ns] - - stop: datetime64[ns] + - start: datetime64[ms] + - stop: datetime64[ms] - interval: string - trades: int64 (nullable) - open: float64 @@ -483,8 +508,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category @@ -504,8 +529,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -519,8 +544,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - currency: category @@ -533,8 +558,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ns] (from 'date' column) - - receipt_timestamp: datetime64[ns] + - timestamp: datetime64[ms] (from 'date' column) + - receipt_timestamp: datetime64[ms] - dt: string - exchange: category - symbol: category From adac33bc4f8bd4dd71811c907cf472405e90d455 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 00:42:34 +0200 Subject: [PATCH 68/87] chore: Convert datetime columns to microsecond precision --- .gitignore | 1 + cryptofeed/backends/deltalake.py | 72 ++++++++++++++++---------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index ac64f2b9e..ed13eb5fa 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ ENV/ # PyCharm .idea/ .aider* +.trunk/ diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index f3b8eff50..af5d6b31e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -155,14 +155,14 @@ def 
_validate_columns(self, df: pd.DataFrame): "exchange": "object", "symbol": "object", "dt": "object", - "timestamp": "datetime64[ms]", - "receipt_timestamp": "datetime64[ms]", + "timestamp": "datetime64[us]", + "receipt_timestamp": "datetime64[us]", } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type == "datetime64[ms]": - # Ensure datetime columns are in millisecond precision - df[col] = df[col].astype("datetime64[ms]") + if expected_type == "datetime64[us]": + # Ensure datetime columns are in microsecond precision + df[col] = df[col].astype("datetime64[us]") if not df[col].dtype == expected_type: raise TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -187,7 +187,7 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns to millisecond precision.") + LOG.debug("Converting datetime columns to microsecond precision.") datetime_columns = ["timestamp", "receipt_timestamp"] for col in datetime_columns: if col in df.columns: @@ -195,13 +195,11 @@ def _convert_datetime_columns(self, df: pd.DataFrame): LOG.warning( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) - # Convert to millisecond precision, handling both string and datetime inputs - df[col] = pd.to_datetime(df[col]).astype("datetime64[ms]") + # Convert to microsecond precision, handling both string and datetime inputs + df[col] = pd.to_datetime(df[col]).astype("datetime64[us]") # Log sample of converted values in readable format if len(df) > 0: - readable_time = ( - df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] - ) + readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") LOG.warning(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' @@ -226,7 +224,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): if "dt" in df.columns and len(df) > 0: LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") - LOG.debug("Datetime columns converted to millisecond precision.") + LOG.debug("Datetime columns converted to microsecond precision.") def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") @@ -388,8 +386,8 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRADES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -406,14 +404,14 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FUNDING """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - mark_price: float64 (nullable) - rate: float64 - - next_funding_time: datetime64[ms] (nullable) + - next_funding_time: datetime64[us] (nullable) - predicted_rate: float64 (nullable) """ @@ -422,8 +420,8 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TICKER """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category 
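(Editor's note: the hunks above and below switch the documented schemas from millisecond to microsecond precision. The snippet below is an editorial illustration of the conversion these patches converge on, not part of any patch. It assumes pandas >= 2.0, which supports non-nanosecond datetime64 resolutions, and assumes the incoming `timestamp`/`receipt_timestamp` fields are float epoch seconds; the sample values are invented.)

```python
import pandas as pd

# Minimal sketch: coerce epoch-second floats to microsecond-precision datetimes
# and derive the 'dt' partition value, mirroring _convert_datetime_columns.
updates = [
    {"exchange": "BINANCE", "symbol": "BTC-USDT", "price": 64250.5,
     "timestamp": 1725321600.123456, "receipt_timestamp": 1725321600.234567},
]
df = pd.DataFrame(updates)

for col in ("timestamp", "receipt_timestamp"):
    # unit="s" is an assumption about the feed payload; at this point in the
    # patch series the backend lets pd.to_datetime infer the unit instead.
    df[col] = pd.to_datetime(df[col], unit="s").astype("datetime64[us]")

# Partition value: a YYYY-MM-DD string derived from the event timestamp.
df["dt"] = df["timestamp"].dt.strftime("%Y-%m-%d")

print(df.dtypes)  # timestamp / receipt_timestamp -> datetime64[us], dt -> object
```

The resulting frame is what `_write_batch` then hands to `write_deltalake` (presumably with `mode="append"` and `partition_by` set to the configured partition columns), so `dt` ends up as a Hive-style partition directory such as `dt=2024-09-03/`.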
@@ -436,8 +434,8 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): default_key = OPEN_INTEREST """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -449,8 +447,8 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = LIQUIDATIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -466,8 +464,8 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): default_key = "book" """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -486,13 +484,13 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = CANDLES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category - - start: datetime64[ms] - - stop: datetime64[ms] + - start: datetime64[us] + - stop: datetime64[us] - interval: string - trades: int64 (nullable) - open: float64 @@ -508,8 +506,8 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): default_key = ORDER_INFO """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category @@ -529,8 +527,8 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = TRANSACTIONS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -544,8 +542,8 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): default_key = BALANCES """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - currency: category @@ -558,8 +556,8 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): default_key = FILLS """ Schema: - - timestamp: datetime64[ms] (from 'date' column) - - receipt_timestamp: datetime64[ms] + - timestamp: datetime64[us] (from 'date' column) + - receipt_timestamp: datetime64[us] - dt: string - exchange: category - symbol: category From 4e2aff6dda10c0248f71a28e3ca5be88659cafc8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:04:50 +0200 Subject: [PATCH 69/87] fix: Change log levels from warning to debug for non-critical messages --- cryptofeed/backends/deltalake.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index af5d6b31e..412794626 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -41,7 +41,7 @@ 
def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.warning("Initializing DeltaLakeCallback") # Changed to warning + LOG.warning("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -192,7 +192,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in datetime_columns: if col in df.columns: # Log sample of original values - LOG.warning( + LOG.debug( f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" ) # Convert to microsecond precision, handling both string and datetime inputs @@ -200,7 +200,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of converted values in readable format if len(df) > 0: readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") - LOG.warning(f"Sample {col} after conversion: {readable_time}") + LOG.debug(f"Sample {col} after conversion: {readable_time}") # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed @@ -222,7 +222,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: - LOG.warning(f"Sample 'dt' value: {df['dt'].iloc[0]}") + LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") LOG.debug("Datetime columns converted to microsecond precision.") @@ -293,7 +293,7 @@ def _handle_missing_values(self, df: pd.DataFrame): async def _write_batch(self, df: pd.DataFrame): LOG.warning( f"_write_batch called with DataFrame of shape {df.shape}" - ) # Changed to warning + ) if df.empty: LOG.warning("DataFrame is empty. Skipping write operation.") return @@ -305,12 +305,12 @@ async def _write_batch(self, df: pd.DataFrame): try: LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." - ) # Changed to warning - LOG.warning(f"DataFrame schema:\n{df.dtypes}") # Changed to warning + ) + LOG.debug(f"DataFrame schema:\n{df.dtypes}") LOG.warning( f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) # Changed to warning + ) write_deltalake( self.delta_table_path, @@ -328,7 +328,7 @@ async def _write_batch(self, df: pd.DataFrame): if self.time_travel: self._update_metadata() - LOG.warning("Batch write successful.") # Changed to warning + LOG.warning("Batch write successful.") break # Exit the retry loop if write is successful except Exception as e: @@ -341,7 +341,7 @@ async def _write_batch(self, df: pd.DataFrame): if attempt < max_retries - 1: LOG.warning( f"Retrying in {retry_delay} seconds..." 
- ) # Changed to warning + ) await asyncio.sleep(retry_delay) else: LOG.error( @@ -351,12 +351,12 @@ async def _write_batch(self, df: pd.DataFrame): async def _optimize_table(self): LOG.warning( f"Running OPTIMIZE on table {self.delta_table_path}" - ) # Changed to warning + ) dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.warning("OPTIMIZE operation completed.") # Changed to warning + LOG.warning("OPTIMIZE operation completed.") def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From 99b3a366bc9f50d96d4d84d1eece4da146031536 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 01:25:25 +0200 Subject: [PATCH 70/87] refactor: Simplify datetime column handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 40 +++++++++++++------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 412794626..d11c3c596 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -205,20 +205,12 @@ def _convert_datetime_columns(self, df: pd.DataFrame): # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed if "timestamp" in df.columns: - df["dt"] = ( - df["timestamp"] - .where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["timestamp"].where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date elif "receipt_timestamp" in df.columns: - df["dt"] = ( - df["receipt_timestamp"] - .where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()) - .dt.strftime("%Y-%m-%d") - ) + df["dt"] = df["receipt_timestamp"].where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date else: LOG.warning("No timestamp column found. Using current date for 'dt'.") - df["dt"] = pd.Timestamp.now().strftime("%Y-%m-%d") + df["dt"] = pd.Timestamp.now().date() # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: @@ -244,7 +236,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): elif col == "dt": # 'dt' should already be created in _convert_datetime_columns LOG.warning("'dt' column not found. 
This should not happen.") - df[col] = pd.Timestamp.now().strftime("%Y-%m-%d") + df[col] = pd.Timestamp.now().date() else: df[col] = "unknown" @@ -256,7 +248,7 @@ def _ensure_partition_columns(self, df: pd.DataFrame): df[col] = df[col].fillna( "unknown" if col != "dt" - else pd.Timestamp.now().strftime("%Y-%m-%d") + else pd.Timestamp.now().date() ) def _handle_missing_values(self, df: pd.DataFrame): @@ -388,7 +380,7 @@ class TradeDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 (nullable) @@ -406,7 +398,7 @@ class FundingDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - mark_price: float64 (nullable) @@ -422,7 +414,7 @@ class TickerDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - bid: float64 @@ -436,7 +428,7 @@ class OpenInterestDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - open_interest: float64 @@ -449,7 +441,7 @@ class LiquidationsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - side: category @@ -466,7 +458,7 @@ class BookDeltaLake(DeltaLakeCallback, BackendBookCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - delta: dict (nullable, contains 'bid' and 'ask' updates) @@ -486,7 +478,7 @@ class CandlesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - start: datetime64[us] @@ -508,7 +500,7 @@ class OrderInfoDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - id: int64 @@ -529,7 +521,7 @@ class TransactionsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - type: category @@ -544,7 +536,7 @@ class BalancesDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - currency: category - balance: float64 @@ -558,7 +550,7 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): Schema: - timestamp: datetime64[us] (from 'date' column) - receipt_timestamp: datetime64[us] - - dt: string + - dt: date - exchange: category - symbol: category - price: float64 From 1b25acd9c27ebb9bc765fcdc744fba20f4bb15f8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:05:38 +0200 Subject: [PATCH 71/87] feat: Add batch processing and 
flush interval to DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 53 +++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d11c3c596..5bfa7bd24 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -7,6 +7,7 @@ import asyncio import logging +import time from collections import defaultdict from typing import Any, Dict, List, Optional, Union @@ -38,6 +39,8 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, + batch_size: int = 1000, + flush_interval: float = 60.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): @@ -67,6 +70,10 @@ def __init__( self.transformations.extend(custom_transformations) # Validate configuration parameters self._validate_configuration() + self.batch_size = batch_size + self.flush_interval = flush_interval + self.batch = [] + self.last_flush_time = time.time() def _validate_configuration(self): if self.optimize_interval <= 0: @@ -115,26 +122,41 @@ async def writer(self): LOG.warning(f"Read queue returned {len(updates)} updates") if updates: LOG.warning(f"Received {len(updates)} updates for processing.") - df = pd.DataFrame(updates) - LOG.warning(f"Created DataFrame with shape: {df.shape}") + self.batch.extend(updates) - LOG.warning("Starting field transformation") - self._transform_columns(df) - LOG.warning("Field transformation completed") - - LOG.warning("Validating columns") - self._validate_columns(df) - LOG.warning("Columns validation completed") - - LOG.warning("Starting batch write") - await self._write_batch(df) - LOG.warning("Batch write completed") + if len(self.batch) >= self.batch_size or (time.time() - self.last_flush_time) >= self.flush_interval: + await self._process_batch() else: - LOG.warning("No updates received, continuing loop") + # Check if we need to flush based on time + if (time.time() - self.last_flush_time) >= self.flush_interval and self.batch: + await self._process_batch() + else: + LOG.warning("No updates received, continuing loop") + await asyncio.sleep(1) # Add a small delay to prevent busy-waiting except Exception as e: LOG.error(f"Error in writer method: {e}", exc_info=True) LOG.warning("Writer method ended") + async def _process_batch(self): + LOG.warning(f"Processing batch of {len(self.batch)} updates") + df = pd.DataFrame(self.batch) + LOG.warning(f"Created DataFrame with shape: {df.shape}") + + LOG.warning("Starting field transformation") + self._transform_columns(df) + LOG.warning("Field transformation completed") + + LOG.warning("Validating columns") + self._validate_columns(df) + LOG.warning("Columns validation completed") + + LOG.warning("Starting batch write") + await self._write_batch(df) + LOG.warning("Batch write completed") + + self.batch = [] + self.last_flush_time = time.time() + def _validate_columns(self, df: pd.DataFrame): LOG.debug("Validating DataFrame columns.") # Check for required columns @@ -357,6 +379,9 @@ def _update_metadata(self): async def stop(self): LOG.info("Stopping DeltaLakeCallback writer.") self.running = False + # Flush any remaining data + if self.batch: + await self._process_batch() def get_version(self, timestamp: Optional[int] = None) -> Optional[int]: if self.time_travel: From d649fea58743fd152b144c0ef5fb1e138925c5ec Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 02:25:43 +0200 
Subject: [PATCH 72/87] fix: Update copyright year in demo_deltalake.py --- examples/demo_deltalake.py | 4 ++-- setup.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index 5c973016e..95d2a405d 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -1,9 +1,9 @@ -''' +""" Copyright (C) 2018-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' +""" from cryptofeed import FeedHandler from cryptofeed.backends.deltalake import FundingDeltaLake, TickerDeltaLake, TradeDeltaLake from cryptofeed.defines import FUNDING, TICKER, TRADES diff --git a/setup.py b/setup.py index 344573348..fe38f7b7d 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,9 @@ import os import sys -from setuptools import Extension, setup -from setuptools import find_packages -from setuptools.command.test import test as TestCommand from Cython.Build import cythonize +from setuptools import Extension, find_packages, setup +from setuptools.command.test import test as TestCommand def get_long_description(): From bb4cb4cbe18431e95a020b494c72bf6becd3c22a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 03:19:11 +0200 Subject: [PATCH 73/87] feat: Add common configuration for Delta Lake callbacks --- examples/demo_deltalake.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index 95d2a405d..d258cb21e 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -23,26 +23,33 @@ def main(): "AWS_REGION": "your_region" } + # Common configuration for all callbacks + common_config = { + "base_path": delta_base_path, + "storage_options": s3_options, + "batch_size": 1000, # Process in batches of 1000 records + "flush_interval": 60.0, # Flush every 60 seconds if batch size not reached + "optimize_interval": 100000, # Optimize after 100,000 rows written + "time_travel": True, + } + # Add Binance feed with Delta Lake callbacks f.add_feed(Binance( channels=[TRADES, FUNDING, TICKER], symbols=['BTC-USDT', 'ETH-USDT'], callbacks={ TRADES: TradeDeltaLake( - base_path=delta_base_path, - optimize_interval=50, # More frequent table optimization - time_travel=True, # Enable time travel feature - storage_options=s3_options # Add S3 configuration + **common_config, + z_order_cols=['timestamp', 'price', 'amount'] ), FUNDING: FundingDeltaLake( - base_path=delta_base_path, - storage_options=s3_options # Add S3 configuration + **common_config, + z_order_cols=['timestamp', 'rate'] ), TICKER: TickerDeltaLake( - base_path=delta_base_path, - partition_cols=['exchange', 'symbol', 'year', 'month', 'day'], # Custom partitioning - z_order_cols=['timestamp', 'bid', 'ask'], # Enable Z-ordering - storage_options=s3_options # Add S3 configuration + **common_config, + partition_cols=['exchange', 'symbol', 'dt'], + z_order_cols=['timestamp', 'bid', 'ask'] ) } )) From 87a77f52e15ec32f75afcf0ad62aeb8177d6846d Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:42:39 +0200 Subject: [PATCH 74/87] fix: Fix logging message in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 5bfa7bd24..97ad9d80a 100644 --- 
a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -119,7 +119,7 @@ async def writer(self): while self.running: try: async with self.read_queue() as updates: - LOG.warning(f"Read queue returned {len(updates)} updates") + LOG.warning(f"Read queue returned: {updates}") if updates: LOG.warning(f"Received {len(updates)} updates for processing.") self.batch.extend(updates) From 98762fdaf32309abca96926c5f2619221b94b378 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:56:33 +0200 Subject: [PATCH 75/87] refactor: Improve logging and error handling in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 97ad9d80a..c1980eb7d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -320,11 +320,14 @@ async def _write_batch(self, df: pd.DataFrame): LOG.warning( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - LOG.debug(f"DataFrame schema:\n{df.dtypes}") - LOG.warning( - f"Writing batch of {len(df)} records to {self.delta_table_path}" - ) + # Moved logging statements here, just before write_deltalake + sample_size = min(5, len(df)) # Show up to 5 rows + LOG.warning(f"Sample of DataFrame to be written (first {sample_size} rows):") + LOG.warning(df.head(sample_size).to_string()) + LOG.warning("DataFrame dtypes:") + LOG.warning(df.dtypes.to_string()) + LOG.warning(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( self.delta_table_path, @@ -346,16 +349,12 @@ async def _write_batch(self, df: pd.DataFrame): break # Exit the retry loop if write is successful except Exception as e: + LOG.error(f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}") LOG.error(f"DataFrame schema:\n{df.dtypes}") LOG.error(f"DataFrame:\n{df}") - LOG.error( - f"Error writing to Delta Lake on attempt {attempt + 1}/{max_retries}: {e}" - ) if attempt < max_retries - 1: - LOG.warning( - f"Retrying in {retry_delay} seconds..." 
- ) + LOG.warning(f"Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) else: LOG.error( From 53a87c793cda6970bff6ffc24592fc9c5f19661c Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:14:37 +0200 Subject: [PATCH 76/87] refactor: Optimize DeltaLakeCallback batch processing --- cryptofeed/backends/deltalake.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index c1980eb7d..405f4d264 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -39,8 +39,8 @@ def __init__( storage_options: Optional[Dict[str, Any]] = None, numeric_type: Union[type, str] = float, none_to: Any = None, - batch_size: int = 1000, - flush_interval: float = 60.0, + batch_size: int = 100, + flush_interval: float = 10.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): @@ -138,21 +138,10 @@ async def writer(self): LOG.warning("Writer method ended") async def _process_batch(self): - LOG.warning(f"Processing batch of {len(self.batch)} updates") df = pd.DataFrame(self.batch) - LOG.warning(f"Created DataFrame with shape: {df.shape}") - - LOG.warning("Starting field transformation") self._transform_columns(df) - LOG.warning("Field transformation completed") - - LOG.warning("Validating columns") self._validate_columns(df) - LOG.warning("Columns validation completed") - - LOG.warning("Starting batch write") await self._write_batch(df) - LOG.warning("Batch write completed") self.batch = [] self.last_flush_time = time.time() @@ -305,9 +294,6 @@ def _handle_missing_values(self, df: pd.DataFrame): ) async def _write_batch(self, df: pd.DataFrame): - LOG.warning( - f"_write_batch called with DataFrame of shape {df.shape}" - ) if df.empty: LOG.warning("DataFrame is empty. 
Skipping write operation.") return @@ -586,4 +572,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: category - type: category - account: string (nullable) - """ + """ \ No newline at end of file From acfd7ccac9e2ec2c9f09355896ba14689da76588 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:26:08 +0200 Subject: [PATCH 77/87] fix: Convert datetime columns to UTC and microsecond precision --- cryptofeed/backends/deltalake.py | 38 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 405f4d264..d3c1b41da 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -11,6 +11,7 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -198,37 +199,30 @@ def _reorder_columns(self, df: pd.DataFrame): df = df[priority_cols + other_cols] def _convert_datetime_columns(self, df: pd.DataFrame): - LOG.debug("Converting datetime columns to microsecond precision.") - datetime_columns = ["timestamp", "receipt_timestamp"] - for col in datetime_columns: + LOG.debug("Converting datetime columns to UTC and microsecond precision.") + INVALID_DATE = np.Timestamp('1900-01-01').date() + + for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: - # Log sample of original values - LOG.debug( - f"Sample {col} before conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}" - ) - # Convert to microsecond precision, handling both string and datetime inputs - df[col] = pd.to_datetime(df[col]).astype("datetime64[us]") - # Log sample of converted values in readable format - if len(df) > 0: - readable_time = df[col].iloc[0].strftime("%Y-%m-%d %H:%M:%S.%f") - LOG.debug(f"Sample {col} after conversion: {readable_time}") - - # Create 'dt' column, prioritizing 'timestamp' over 'receipt_timestamp' - min_valid_date = pd.Timestamp("2000-01-01") # Adjust this as needed + # Convert timestamp (seconds since epoch) to UTC datetime + df[col] = pd.to_datetime(df[col], unit='s', utc=True) + df[col] = df[col].dt.tz_localize(None) # Remove timezone info after conversion + LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") + + # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE if "timestamp" in df.columns: - df["dt"] = df["timestamp"].where(df["timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date + df["dt"] = df["timestamp"].dt.date elif "receipt_timestamp" in df.columns: - df["dt"] = df["receipt_timestamp"].where(df["receipt_timestamp"] >= min_valid_date, pd.Timestamp.now()).dt.date + df["dt"] = df["receipt_timestamp"].dt.date else: - LOG.warning("No timestamp column found. Using current date for 'dt'.") - df["dt"] = pd.Timestamp.now().date() + LOG.warning("Neither timestamp nor receipt_timestamp column found. 
Using invalid date for 'dt'.") + df["dt"] = INVALID_DATE # Log sample of 'dt' column if "dt" in df.columns and len(df) > 0: LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") - LOG.debug("Datetime columns converted to microsecond precision.") - + LOG.debug("Datetime columns converted and 'dt' column created.") def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] From b600513ae691d6c2bbed1f32e5a51d64217e61e5 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:33:51 +0200 Subject: [PATCH 78/87] feat: Convert datetime columns to UTC and microsecond precision --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index d3c1b41da..936b8c58f 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -200,7 +200,7 @@ def _reorder_columns(self, df: pd.DataFrame): def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns to UTC and microsecond precision.") - INVALID_DATE = np.Timestamp('1900-01-01').date() + INVALID_DATE = np.datetime64('1900-01-01').date() for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: From fae08d4b5ababb3ccc505a35f7115b9aeeb76b59 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 19:37:29 +0200 Subject: [PATCH 79/87] feat: Convert datetime columns to UTC and microsecond precision --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 936b8c58f..504b7936e 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -200,7 +200,7 @@ def _reorder_columns(self, df: pd.DataFrame): def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug("Converting datetime columns to UTC and microsecond precision.") - INVALID_DATE = np.datetime64('1900-01-01').date() + INVALID_DATE = pd.Timestamp('1900-01-01').date() for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: From b313e5e9a6291149f4959347c26d2a7d95c81f95 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:11:03 +0200 Subject: [PATCH 80/87] feat: Increase optimize_interval, enable time_travel, change numeric_type and batch_size in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 504b7936e..b2fbe8b95 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -34,13 +34,13 @@ def __init__( key: Optional[str] = None, custom_columns: Optional[Dict[str, str]] = None, partition_cols: Optional[List[str]] = None, - optimize_interval: int = 100, + optimize_interval: int = 1000, z_order_cols: Optional[List[str]] = None, - time_travel: bool = False, + time_travel: bool = True, storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = float, + numeric_type: Union[type, str] = "float64", none_to: Any = None, - batch_size: int = 100, + batch_size: int = 10000, flush_interval: float = 10.0, custom_transformations: Optional[List[callable]] = None, **kwargs: Any, @@ -54,7 +54,7 @@ def __init__( self.partition_cols = partition_cols 
or ["exchange", "symbol", "dt"] self.optimize_interval = optimize_interval self.z_order_cols = z_order_cols or self._default_z_order_cols() - self.time_travel = time_travel + self.time_travel = time_travel or False self.storage_options = storage_options or {} self.write_count = 0 self.running = True From 2261007c61b6e241984797f2ff3d75aeaaeb6691 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:42:31 +0200 Subject: [PATCH 81/87] fix: Ensure datetime columns are in microsecond precision and remove timezone info after conversion --- cryptofeed/backends/deltalake.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index b2fbe8b95..bfe2a7e7a 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -172,9 +172,6 @@ def _validate_columns(self, df: pd.DataFrame): } for col, expected_type in expected_types.items(): if col in df.columns: - if expected_type == "datetime64[us]": - # Ensure datetime columns are in microsecond precision - df[col] = df[col].astype("datetime64[us]") if not df[col].dtype == expected_type: raise TypeError( f"Column '{col}' should be of type {expected_type}, but is {df[col].dtype}" @@ -205,8 +202,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: # Convert timestamp (seconds since epoch) to UTC datetime - df[col] = pd.to_datetime(df[col], unit='s', utc=True) - df[col] = df[col].dt.tz_localize(None) # Remove timezone info after conversion + df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None) LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE From 1da540f52c4740a993779fbca33d7e0ca8b2d59a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:47:52 +0200 Subject: [PATCH 82/87] feat: Change numeric_type to float in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index bfe2a7e7a..3b4a6a46d 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -38,7 +38,7 @@ def __init__( z_order_cols: Optional[List[str]] = None, time_travel: bool = True, storage_options: Optional[Dict[str, Any]] = None, - numeric_type: Union[type, str] = "float64", + numeric_type: Union[type, str] = float, none_to: Any = None, batch_size: int = 10000, flush_interval: float = 10.0, From 3d1a84cc7ad6e32379b258c9792fcd25ac0f59a4 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 21:01:56 +0200 Subject: [PATCH 83/87] fix: Ensure datetime columns are in microsecond precision and convert timestamp columns to UTC datetime --- cryptofeed/backends/deltalake.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 3b4a6a46d..2a9364fc0 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -171,6 +171,9 @@ def _validate_columns(self, df: pd.DataFrame): "receipt_timestamp": "datetime64[us]", } for col, expected_type in expected_types.items(): + if expected_type == "datetime64[us]": + # Ensure datetime columns are 
in microsecond precision + df[col] = df[col].astype("datetime64[us]") if col in df.columns: if not df[col].dtype == expected_type: raise TypeError( @@ -202,7 +205,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: # Convert timestamp (seconds since epoch) to UTC datetime - df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None) + df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None).astype("datetime64[us, UTC]") LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE From fabc822f9a7c143331d58b4173de873068e6892b Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 21:10:31 +0200 Subject: [PATCH 84/87] fix: Convert timestamp columns to datetime64[ns] instead of datetime64[us, UTC] --- cryptofeed/backends/deltalake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 2a9364fc0..1ee3bd623 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -205,7 +205,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): for col in ['timestamp', 'receipt_timestamp']: if col in df.columns: # Convert timestamp (seconds since epoch) to UTC datetime - df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None).astype("datetime64[us, UTC]") + df[col] = pd.to_datetime(df[col], unit='s', utc=True).dt.tz_localize(None) LOG.debug(f"Sample {col} after conversion: {df[col].iloc[0] if len(df) > 0 else 'N/A'}") # Create 'dt' column, prioritizing 'timestamp', then 'receipt_timestamp', fallback to INVALID_DATE From 47ca6a4be0c068d092002ce12232758da6c02de8 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Mon, 9 Sep 2024 23:02:01 +0200 Subject: [PATCH 85/87] fix: Update logging levels in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 1ee3bd623..0022e9b98 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -11,7 +11,6 @@ from collections import defaultdict from typing import Any, Dict, List, Optional, Union -import numpy as np import pandas as pd from deltalake import DeltaTable, write_deltalake @@ -45,7 +44,7 @@ def __init__( custom_transformations: Optional[List[callable]] = None, **kwargs: Any, ): - LOG.warning("Initializing DeltaLakeCallback") + LOG.debug("Initializing DeltaLakeCallback") super().__init__() self.key = key or self.default_key self.base_path = base_path @@ -116,13 +115,13 @@ def _default_z_order_cols(self) -> List[str]: return [col for col in z_order_cols if col not in self.partition_cols] async def writer(self): - LOG.warning("Writer method started") + LOG.debug("Writer method started") while self.running: try: async with self.read_queue() as updates: - LOG.warning(f"Read queue returned: {updates}") + LOG.debug(f"Read queue returned: {updates}") if updates: - LOG.warning(f"Received {len(updates)} updates for processing.") + LOG.debug(f"Received {len(updates)} updates for processing.") self.batch.extend(updates) if len(self.batch) >= self.batch_size or (time.time() - self.last_flush_time) >= self.flush_interval: 
@@ -132,11 +131,11 @@ async def writer(self): if (time.time() - self.last_flush_time) >= self.flush_interval and self.batch: await self._process_batch() else: - LOG.warning("No updates received, continuing loop") + LOG.debug("No updates received, continuing loop") await asyncio.sleep(1) # Add a small delay to prevent busy-waiting except Exception as e: LOG.error(f"Error in writer method: {e}", exc_info=True) - LOG.warning("Writer method ended") + LOG.debug("Writer method ended") async def _process_batch(self): df = pd.DataFrame(self.batch) @@ -296,16 +295,16 @@ async def _write_batch(self, df: pd.DataFrame): for attempt in range(max_retries): try: - LOG.warning( + LOG.debug( f"Attempting to write batch to Delta Lake (Attempt {attempt + 1}/{max_retries})." ) - # Moved logging statements here, just before write_deltalake + # Logging statements just before write_deltalake sample_size = min(5, len(df)) # Show up to 5 rows - LOG.warning(f"Sample of DataFrame to be written (first {sample_size} rows):") - LOG.warning(df.head(sample_size).to_string()) - LOG.warning("DataFrame dtypes:") - LOG.warning(df.dtypes.to_string()) + LOG.debug(f"Sample of DataFrame to be written (first {sample_size} rows):") + LOG.debug(df.head(sample_size).to_string()) + LOG.debug("DataFrame dtypes:") + LOG.debug(df.dtypes.to_string()) LOG.warning(f"Writing batch of {len(df)} records to {self.delta_table_path}") write_deltalake( @@ -341,14 +340,14 @@ async def _write_batch(self, df: pd.DataFrame): ) async def _optimize_table(self): - LOG.warning( + LOG.debug( f"Running OPTIMIZE on table {self.delta_table_path}" ) dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) dt.optimize.compact() if self.z_order_cols: dt.optimize.z_order(self.z_order_cols) - LOG.warning("OPTIMIZE operation completed.") + LOG.debug("OPTIMIZE operation completed.") def _update_metadata(self): dt = DeltaTable(self.delta_table_path, storage_options=self.storage_options) From cf0004a24d887a86c27364f01145858053033eb0 Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 10 Sep 2024 00:51:33 +0200 Subject: [PATCH 86/87] fix: Remove unnecessary logging statements in DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index 0022e9b98..fea97730f 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -300,9 +300,9 @@ async def _write_batch(self, df: pd.DataFrame): ) # Logging statements just before write_deltalake - sample_size = min(5, len(df)) # Show up to 5 rows - LOG.debug(f"Sample of DataFrame to be written (first {sample_size} rows):") - LOG.debug(df.head(sample_size).to_string()) + # sample_size = min(5, len(df)) # Show up to 5 rows + # LOG.debug(f"Sample of DataFrame to be written (first {sample_size} rows):") + # LOG.debug(df.head(sample_size).to_string()) LOG.debug("DataFrame dtypes:") LOG.debug(df.dtypes.to_string()) LOG.warning(f"Writing batch of {len(df)} records to {self.delta_table_path}") @@ -564,4 +564,4 @@ class FillsDeltaLake(DeltaLakeCallback, BackendCallback): - liquidity: category - type: category - account: string (nullable) - """ \ No newline at end of file + """ From c4d7b239b9012e864d6ce4f30c63a785197ec99a Mon Sep 17 00:00:00 2001 From: Tommy K <140900186+tommy-ca@users.noreply.github.com> Date: Tue, 10 Sep 2024 01:16:17 +0200 Subject: [PATCH 87/87] fix: Convert integer columns in 
DeltaLakeCallback --- cryptofeed/backends/deltalake.py | 1 + examples/demo_deltalake.py | 6 +++--- setup.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cryptofeed/backends/deltalake.py b/cryptofeed/backends/deltalake.py index fea97730f..7ebe520ae 100644 --- a/cryptofeed/backends/deltalake.py +++ b/cryptofeed/backends/deltalake.py @@ -221,6 +221,7 @@ def _convert_datetime_columns(self, df: pd.DataFrame): LOG.debug(f"Sample 'dt' value: {df['dt'].iloc[0]}") LOG.debug("Datetime columns converted and 'dt' column created.") + def _convert_int_columns(self, df: pd.DataFrame): LOG.debug("Converting integer columns.") int_columns = ["id", "trade_id", "trades"] diff --git a/examples/demo_deltalake.py b/examples/demo_deltalake.py index d258cb21e..8d3c46cbd 100644 --- a/examples/demo_deltalake.py +++ b/examples/demo_deltalake.py @@ -12,7 +12,7 @@ def main(): f = FeedHandler() - + # Define the Delta Lake base path (can be local or S3) delta_base_path = 's3://your-bucket/path/to/delta/tables' @@ -53,9 +53,9 @@ def main(): ) } )) - + f.run() if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/setup.py b/setup.py index fe38f7b7d..0542ef42c 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,9 @@ -''' +""" Copyright (C) 2017-2024 Bryant Moscon - bmoscon@gmail.com Please see the LICENSE file for the terms and conditions associated with this software. -''' +""" import os import sys
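
For reference, the tables written by this backend are ordinary Delta tables partitioned by exchange, symbol, and dt, so they can be read back without cryptofeed. The following is a minimal, illustrative read-side sketch, not part of the patches above: it assumes the `deltalake` package's `DeltaTable` constructor accepts `version` and `storage_options` and that `to_pandas()` accepts partition filters; the bucket path, region, exchange, symbol, date, and version number are placeholders.

    # Illustrative read-side sketch (assumed deltalake API; placeholder paths and values).
    from deltalake import DeltaTable

    TABLE_URI = "s3://your-bucket/path/to/delta/tables/trades"  # hypothetical table path
    storage_options = {"AWS_REGION": "your_region"}  # plus AWS credentials, as in the demo above

    # Latest snapshot, pruned by the partition columns (exchange / symbol / dt).
    table = DeltaTable(TABLE_URI, storage_options=storage_options)
    df = table.to_pandas(
        partitions=[
            ("exchange", "=", "BINANCE"),
            ("symbol", "=", "BTC-USDT"),
            ("dt", "=", "2024-09-09"),
        ],
        columns=["timestamp", "receipt_timestamp", "price", "amount", "side"],
    )
    print(df.head())

    # Time travel: load an earlier version of the same table (version number is a placeholder).
    historical = DeltaTable(TABLE_URI, version=3, storage_options=storage_options)
    print(historical.to_pandas().shape)

Because dt is a plain date partition and exchange/symbol are low-cardinality strings, partition filters like the ones above avoid a full table scan; Z-ordering on timestamp/price further narrows the files touched by range queries within a partition.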