-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c39a26f
commit dcc16b1
Showing
2 changed files
with
47 additions
and
181 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,72 +0,0 @@ | ||
"""Scraping source using scrapy""" | ||
from typing import Any, Iterator, List, TypeVar | ||
from queue import Empty | ||
|
||
import dlt | ||
|
||
from dlt.common import logger | ||
from dlt.common.configuration.inject import with_config | ||
|
||
from .helpers import ScrapingConfig | ||
from .queue import BaseQueue, QueueClosedError | ||
|
||
|
||
T = TypeVar("T") | ||
|
||
|
||
@dlt.source
def scrapy_source(queue: BaseQueue[T]) -> Iterator[Any]:
    """Expose the scraping queue as a dlt source.

    Args:
        queue (BaseQueue[T]): Queue instance handed on to ``scrapy_resource``.

    Yields:
        Iterator[Any]: the single object returned by ``scrapy_resource(queue)``.
    """
    resource = scrapy_resource(queue)
    yield resource
|
||
|
||
@with_config(sections=("sources", "scraping"), spec=ScrapingConfig)
def scrapy_resource(
    queue: BaseQueue[T],
    batch_size: int = dlt.config.value,
    queue_result_timeout: int = dlt.config.value,
) -> Iterator[Any]:
    """Drain scraped items from the queue and emit them in batches.

    Args:
        queue (BaseQueue[T]): Queue instance the items are read from.
        batch_size (int): a batch is emitted once it holds at least this
            many items.
        queue_result_timeout (int): timeout passed to ``queue.get`` while
            waiting for the next item.

    Yields:
        Iterator[Any]: lists of scraped items. A partially filled list is
        emitted when the queue stays empty for the timeout, and once more
        when the queue is found closed.
    """
    pending: List[T] = []
    emitted = 0

    while True:
        # Flush as soon as the batch has reached the configured size.
        if len(pending) >= batch_size:
            emitted += 1
            yield pending
            pending = []

        try:
            # The closed flag is checked before every read, so the loop
            # stops without fetching anything further from a closed queue.
            if queue.is_closed:
                raise QueueClosedError("Queue is closed")

            pending.append(queue.get(timeout=queue_result_timeout))

            # Mark task as completed
            queue.task_done()
        except Empty:
            logger.info(f"Queue has been empty for {queue_result_timeout}s...")

            # Nothing collected yet: keep waiting for more items.
            if not pending:
                continue

            # Hand over whatever was gathered so far.
            emitted += 1
            yield pending
            pending = []
        except QueueClosedError:
            logger.info("Queue is closed, stopping...")

            # Emit the remainder before finishing.
            if pending:
                emitted += 1
                yield pending

            logger.info(f"Loaded {emitted} batches")

            return
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters