Skip to content

Commit

Permalink
Refactor Players Search parsing (#55)
Browse files: browse the repository at this point in the history
* Refactor Players Search result parsing

* Improve utils functions

* Update unit tests
  • Loading branch information
felipeall authored Jan 11, 2024
1 parent 7b97d3e commit 0d7bf48
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 49 deletions.
61 changes: 28 additions & 33 deletions app/services/players/search.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from dataclasses import dataclass
from datetime import datetime
from xml.etree import ElementTree

from app.services.base import TransfermarktBase
from app.utils.regex import REGEX_CHART_CLUB_ID
from app.utils.utils import extract_from_url, safe_regex
from app.utils.utils import extract_from_url, safe_regex, trim
from app.utils.xpath import Players


Expand Down Expand Up @@ -38,41 +39,35 @@ def __parse_search_results(self) -> list:
Returns:
list: A list of dictionaries, with each dictionary representing a player search result.
"""
idx = [extract_from_url(url) for url in self.get_list_by_xpath(Players.Search.URL)]
name = self.get_list_by_xpath(Players.Search.NAME)
position = self.get_list_by_xpath(Players.Search.POSITION)
club_name = self.get_list_by_xpath(Players.Search.CLUB_NAME)
club_id = [
safe_regex(img, REGEX_CHART_CLUB_ID, "club_id") for img in self.get_list_by_xpath(Players.Search.CLUB_IMAGE)
]
age = self.get_list_by_xpath(Players.Search.AGE)
nationality = self.get_list_by_xpath(Players.Search.NATIONALITY)
market_value = self.get_list_by_xpath(Players.Search.MARKET_VALUE)
search_results: list[ElementTree] = self.page.xpath(Players.Search.RESULTS)
results = []

return [
{
"id": idx,
"name": name,
"position": position,
"club": {
"id": club_id,
"name": club_name,
for result in search_results:
idx = extract_from_url(result.xpath(Players.Search.ID))
name = trim(result.xpath(Players.Search.NAME))
position = trim(result.xpath(Players.Search.POSITION))
club_name = trim(result.xpath(Players.Search.CLUB_NAME))
club_id = safe_regex(result.xpath(Players.Search.CLUB_IMAGE), REGEX_CHART_CLUB_ID, "club_id")
age = trim(result.xpath(Players.Search.AGE))
nationalities = result.xpath(Players.Search.NATIONALITIES)
market_value = trim(result.xpath(Players.Search.MARKET_VALUE))

results.append(
{
"id": idx,
"name": name,
"position": position,
"club": {
"name": club_name,
"id": club_id,
},
"age": age,
"nationalities": nationalities,
"marketValue": market_value,
},
"age": age,
"nationality": nationality,
"marketValue": market_value,
}
for idx, name, club_id, club_name, position, age, nationality, market_value in zip(
idx,
name,
club_id,
club_name,
position,
age,
nationality,
market_value,
)
]

return results

def search_players(self) -> dict:
"""
Expand Down
14 changes: 9 additions & 5 deletions app/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def zip_lists_into_dict(list_keys: list, list_values: list) -> dict:
return {k: v for k, v in zip(list_keys, list_values)}


def extract_from_url(tfmkt_url: str, element: str = "id") -> Optional[str]:
def extract_from_url(tfmkt_url: Optional[str], element: str = "id") -> Optional[str]:
"""
Extract a specific element from a Transfermarkt URL using regular expressions.
Expand All @@ -48,6 +48,9 @@ def extract_from_url(tfmkt_url: str, element: str = "id") -> Optional[str]:
Returns:
Optional[str]: The extracted element value or None if not found.
"""
if not tfmkt_url:
return None

regex: str = (
r"/(?P<code>[\w%-]+)"
r"/(?P<category>[\w-]+)"
Expand All @@ -56,8 +59,9 @@ def extract_from_url(tfmkt_url: str, element: str = "id") -> Optional[str]:
r"(/saison_id/(?P<season_id>\d{4}))?"
r"(/transfer_id/(?P<transfer_id>\d+))?"
)

try:
groups: dict = re.match(regex, tfmkt_url).groupdict()
groups: dict = re.match(regex, trim(tfmkt_url)).groupdict()
except TypeError:
return None
return groups.get(element)
Expand All @@ -79,7 +83,7 @@ def trim(text: Union[list, str]) -> str:
return text.strip().replace("\xa0", "")


def safe_regex(text: Optional[str], regex, group: str) -> Optional[str]:
def safe_regex(text: Optional[Union[str, list]], regex, group: str) -> Optional[str]:
"""
Safely apply a regular expression and extract a specific group from the matched text.
Expand All @@ -91,11 +95,11 @@ def safe_regex(text: Optional[str], regex, group: str) -> Optional[str]:
Returns:
Optional[str]: The extracted group value or None if not found or if the input is not a string.
"""
if not isinstance(text, str):
if not isinstance(text, (str, list)) or not text:
return None

try:
groups = re.search(regex, text).groupdict()
groups = re.search(regex, trim(text)).groupdict()
return groups.get(group)
except AttributeError:
return None
Expand Down
19 changes: 10 additions & 9 deletions app/utils/xpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,17 @@ class Profile:
SOCIAL_MEDIA = "//div[@class='socialmedia-icons']//@href"

class Search:
BASE = "//div[@class='box'][h2[contains(text(), 'players')]]"
FOUND = "//text()"
URL = BASE + "//td[@class='hauptlink']//a//@href"
NAME = BASE + "//td[@class='hauptlink']//a//@title"
POSITION = BASE + "//td[@class='zentriert'][1]//text()"
CLUB_IMAGE = BASE + "//td[@class='zentriert'][2]//img//@src"
CLUB_NAME = BASE + "//img[@class='tiny_wappen']//@title"
AGE = BASE + "//td[@class='zentriert'][3]//text()"
NATIONALITY = BASE + "//img//@title"
MARKET_VALUE = BASE + "//td[@class='rechts hauptlink']//text()"
BASE = "//div[@class='box'][h2[contains(text(), 'players')]]"
RESULTS = BASE + "//tbody//tr[@class='odd' or @class='even']"
ID = ".//td[@class='hauptlink']//a/@href"
NAME = ".//td[@class='hauptlink']//a//@title"
POSITION = ".//td[@class='zentriert'][1]//text()"
CLUB_NAME = ".//img[@class='tiny_wappen']//@title"
CLUB_IMAGE = ".//img[@class='tiny_wappen']//@src"
AGE = ".//td[@class='zentriert'][3]//text()"
NATIONALITIES = ".//img[@class='flaggenrahmen']/@title"
MARKET_VALUE = ".//td[@class='rechts hauptlink']//text()"

class MarketValue:
URL = "//a[@class='data-header__market-value-wrapper']//@href"
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def regex_value_variation():

@pytest.fixture
def regex_integer():
return Regex(r"^\d+$")
return Regex(r"^(\d+|-)$")


@pytest.fixture
Expand Down
2 changes: 1 addition & 1 deletion tests/players/test_players_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_players_search(query, page_number, len_greater_than_0, regex_integer, r
"name": And(str, len_greater_than_0),
},
"age": And(str, len_greater_than_0, regex_integer),
"nationality": And(str, len_greater_than_0),
"nationalities": And(list, len_greater_than_0),
"marketValue": And(str, len_greater_than_0, regex_market_value),
},
],
Expand Down

0 comments on commit 0d7bf48

Please sign in to comment.