
Page limit Parameter #38

Open
firmai opened this issue Aug 8, 2024 · 1 comment
Labels: enhancement (New feature or request)

Comments


firmai commented Aug 8, 2024

Hi team, it would be helpful to expose a page limit parameter so that generic requests like 'climate' can stop after a certain number of pages, if the user so chooses. This could be the expected behaviour:

class EdgarTextSearcher:
    
    #---> perhaps adjust like this? 

    def _fetch_search_request_results(
        self,
        search_request_url_args: str,
        min_wait_seconds: float,
        max_wait_seconds: float,
        retries: int,
        max_pages: Optional[int] = None  # New parameter
    ) -> Iterator[Iterator[Dict[str, Any]]]:
        """
        Fetches the results for the given search request and paginates through the results.

        :param search_request_url_args: URL-encoded request arguments string to concatenate to the SEC website URL
        :param min_wait_seconds: minimum number of seconds to wait for the request to complete
        :param max_wait_seconds: maximum number of seconds to wait for the request to complete
        :param retries: number of times to retry the request before failing
        :param max_pages: maximum number of pages to fetch (optional)
        :return: Iterator of dictionaries representing the parsed table rows
        """

        # Fetch first page, verify that the request was successful by checking the results table appears on the page
        self.json_response = fetch_page(
            f"{TEXT_SEARCH_BASE_URL}{search_request_url_args}",
            min_wait_seconds,
            max_wait_seconds,
            retries,
        )(
            lambda json_response: json_response.get("error") is None
            and json_response.get("hits", {}).get("hits", 0) != 0,
            f"First search request failed for URL {TEXT_SEARCH_BASE_URL}{search_request_url_args} ...",
        )

        # Get number of pages
        num_pages = self._compute_number_of_pages()
        
        # Limit the number of pages if max_pages is specified
        if max_pages is not None:
            num_pages = min(num_pages, max_pages)
            print(f"Limiting search to {num_pages} pages")

        for i in range(1, num_pages + 1):
            paginated_url = f"{TEXT_SEARCH_BASE_URL}{search_request_url_args}&page={i}&from={100*(i-1)}"
            try:
                self.json_response = fetch_page(
                    paginated_url,
                    min_wait_seconds,
                    max_wait_seconds,
                    retries,
                )(
                    lambda json_response: json_response.get("error") is None,
                    f"Search request failed for page {i} at URL {paginated_url}, skipping page...",
                )
                if self.json_response.get("hits", {}).get("hits", 0) == 0:
                    raise ResultsTableNotFoundError()
                page_results = self._parse_table_rows(paginated_url)
                yield page_results
            except PageCheckFailedError as e:
                print(e)
                continue
            except ResultsTableNotFoundError:
                print(
                    f"Could not find results table on page {i} at URL {paginated_url}, skipping page..."
                )
                continue
            except Exception as e:
                print(
                    f"Unexpected {e.__class__.__name__} error occurred while fetching page {i} at URL {paginated_url}, skipping page: {e}"
                )
                continue

    def text_search(
        self,
        keywords: List[str],
        entity_id: Optional[str],
        filing_form: Optional[str],
        single_forms: Optional[str],
        start_date: date,
        end_date: date,
        min_wait_seconds: float,
        max_wait_seconds: float,
        retries: int,
        destination: str,
        peo_in: Optional[str],
        inc_in: Optional[str],
        max_pages: Optional[int] = None  # New parameter
    ) -> None:
        """
        Searches the SEC website for filings based on the given parameters.

        :param keywords: Search keywords to input in the "Document word or phrase" field
        :param entity_id: Entity/Person name, ticker, or CIK number to input in the "Company name, ticker, or CIK" field
        :param filing_form: Group to select within the filing category dropdown menu, defaults to None
        :param single_forms: List of single forms to search for (e.g. ['10-K', '10-Q']), defaults to None
        :param start_date: Start date for the custom date range
        :param end_date: End date for the custom date range
        :param min_wait_seconds: Minimum number of seconds to wait for the request to complete
        :param max_wait_seconds: Maximum number of seconds to wait for the request to complete
        :param retries: Number of times to retry the request before failing
        :param destination: Name of the CSV file to write the results to
        :param peo_in: Search principal executive offices in a location (e.g. "NY,OH")
        :param inc_in: Search incorporated in a location (e.g. "NY,OH")
        :param max_pages: Maximum number of pages to fetch (optional)
        """
        self._generate_search_requests(
            keywords=keywords,
            entity_id=entity_id,
            filing_form=filing_form,
            single_forms=single_forms,
            start_date=start_date,
            end_date=end_date,
            min_wait_seconds=min_wait_seconds,
            max_wait_seconds=max_wait_seconds,
            retries=retries,
            peo_in=peo_in,
            inc_in=inc_in,
        )

        search_requests_results: List[Iterator[Iterator[Dict[str, Any]]]] = []
        for r in self.search_requests:

            # Run generated search requests and paginate through results
            try:
                all_pages_results: Iterator[Iterator[Dict[str, Any]]] = (
                    self._fetch_search_request_results(
                        search_request_url_args=r,
                        min_wait_seconds=min_wait_seconds,
                        max_wait_seconds=max_wait_seconds,
                        retries=retries,
                        max_pages=max_pages  # Pass the new parameter
                    )
                )
                search_requests_results.append(all_pages_results)

            except Exception as e:
                print(
                    f"Skipping search request due to an unexpected {e.__class__.__name__} for request parameters '{r}': {e}"
                )
        if not search_requests_results:
            raise NoResultsFoundError("No results found for the search query")
        write_results_to_file(
            itertools.chain(*search_requests_results),
            destination,
            TEXT_SEARCH_CSV_FIELDS_NAMES,
        )

And it could then be called like this:

edgar_searcher = EdgarTextSearcher()
edgar_searcher.text_search(
    keywords=["Resignations"],
    entity_id=None,
    filing_form=None,
    single_forms=None,
    start_date=date(2024, 1, 1),
    end_date=date(2024, 12, 31),
    min_wait_seconds=0.1,
    max_wait_seconds=0.5,
    retries=3,
    destination="results.csv",
    peo_in=None,
    inc_in=None,
    max_pages=5  # This will limit the search to the first 5 pages
)

Thanks for the nice tool!


firmai commented Aug 8, 2024

Just playing with it, and realising a max wait time parameter could also be very valuable.
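
For what it's worth, min_wait_seconds / max_wait_seconds already bound the per-request wait, so what I have in mind here is a cap on the total time spent paginating through a search. A rough sketch of that idea as a standalone helper (max_total_wait_seconds and paginate_with_time_budget are just placeholder names, not part of the current code):

import time
from typing import Any, Callable, Dict, Iterator, Optional


def paginate_with_time_budget(
    fetch_page_fn: Callable[[int], Dict[str, Any]],   # fetches and parses one results page by page number
    num_pages: int,
    max_total_wait_seconds: Optional[float] = None,   # hypothetical overall time budget for the whole search
) -> Iterator[Dict[str, Any]]:
    """Yield page results until all pages are fetched or the time budget runs out."""
    started = time.monotonic()
    for page in range(1, num_pages + 1):
        elapsed = time.monotonic() - started
        if max_total_wait_seconds is not None and elapsed >= max_total_wait_seconds:
            print(
                f"Time budget of {max_total_wait_seconds}s reached after {page - 1} pages, stopping early."
            )
            break
        yield fetch_page_fn(page)

The same check could of course live inside _fetch_search_request_results, right next to the max_pages handling above.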

GalenReich added the enhancement (New feature or request) label Aug 12, 2024