Skip to content

Commit

Permalink
Fix get_data_source_max_date_copa
Browse files Browse the repository at this point in the history
  • Loading branch information
Winzen committed Sep 10, 2024
1 parent 8cee0d2 commit e084ff5
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 10 deletions.
7 changes: 2 additions & 5 deletions pipelines/datasets/mundo_transfermarkt_competicoes/flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
from pipelines.constants import constants

###############################################################################
- from pipelines.datasets.mundo_transfermarkt_competicoes.constants import (
- constants as mundo_constants,
- )
from pipelines.datasets.mundo_transfermarkt_competicoes.schedules import (
every_day_brasileirao,
every_day_copa,
Expand All @@ -23,7 +20,7 @@
execucao_coleta_sync,
make_partitions,
get_data_source_transfermarkt_max_date,
- get_data_source_max_date
+ get_data_source_max_date_copa
)
from pipelines.utils.constants import constants as utils_constants
from pipelines.utils.decorators import Flow
Expand Down Expand Up @@ -149,7 +146,7 @@
prefix="Dump: ", dataset_id=dataset_id, table_id=table_id, wait=table_id
)

- data_source_max_date = get_data_source_max_date()
+ data_source_max_date = get_data_source_max_date_copa()

outdated = check_if_data_is_outdated(
dataset_id=dataset_id,
Expand Down
10 changes: 5 additions & 5 deletions pipelines/datasets/mundo_transfermarkt_competicoes/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
max_retries=1,
retry_delay=timedelta(seconds=60),
)
- def get_data_source_max_date() -> datetime:
+ def get_data_source_max_date_copa() -> datetime:

season = mundo_constants.SEASON.value
- base_url = f"https://www.transfermarkt.com/copa-do-brasil/gesamtspielplan/pokalwettbewerb/BRC/saison_id/{season}"
+ base_url = f"https://www.transfermarkt.com.br/copa-do-brasil/gesamtspielplan/pokalwettbewerb/BRC/saison_id/{season}"

headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
Expand All @@ -44,15 +44,15 @@ def get_data_source_max_date() -> datetime:

soup = BeautifulSoup(html.text)

- pattern = r'\b[A-Za-z]{3}\s+\d{2},\s+\d{4}\b'
+ pattern = r'\d+/\d+/\d+'

datas = [re.findall(pattern, element.text)[0]
for element in soup.select("tr:not([class]) td.hide-for-small")
if re.findall(pattern, element.text)]

- ultima_data = max([datetime.strptime(data, "%b %d, %Y")
+ ultima_data = max([datetime.strptime(data, "%d/%m/%Y")
for data in datas
- if datetime.strptime(data, "%b %d, %Y") <= datetime.today()])
+ if datetime.strptime(data, "%d/%m/%Y") <= datetime.today()])
return ultima_data


Expand Down

0 comments on commit e084ff5

Please sign in to comment.