Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolution of various bugs and general maintenance of the project #216

Merged
merged 10 commits into from
May 29, 2024
4 changes: 2 additions & 2 deletions google_play_scraper/constants/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ def extract_content(self, source: dict) -> Any:

def extract_categories(s, categories=None):
# Init an empty list if first iteration
if categories == None:
if categories is None:
categories = []
if s == None or len(s) == 0:
if s is None or len(s) == 0:
return categories

if len(s) >= 4 and type(s[0]) is str:
Expand Down
10 changes: 5 additions & 5 deletions google_play_scraper/constants/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@


class Regex:
    """Pre-compiled regular expressions shared by the scraper.

    Every pattern is written as a raw string so that regex escapes such as
    ``\\s`` or ``\\)`` reach the ``re`` engine untouched instead of being
    parsed (and warned about) as invalid Python string escapes.
    """

    # Any single character that is not a decimal digit.
    NOT_NUMBER = re.compile(r"\D")
    # Shortest span from "AF_initDataCallback" up to the next "</script".
    SCRIPT = re.compile(r"AF_initDataCallback[\s\S]*?</script")
    # Captures a "ds:..." identifier that is terminated by a single quote.
    KEY = re.compile(r"(ds:.*?)'")
    # Captures the text between "data:" and the ", sideChannel: {}});</" tail.
    VALUE = re.compile(r"data:([\s\S]*?), sideChannel: {}}\);<\/")
    # Captures everything after a ")]}'" prefix followed by two newlines.
    # NOTE(review): presumably the payload of the reviews response - confirm.
    REVIEWS = re.compile(r"\)]}'\n\n([\s\S]+)")
    # Same pattern as REVIEWS, kept under a separate name.
    # NOTE(review): presumably applied to the permissions response - confirm.
    PERMISSIONS = re.compile(r"\)]}'\n\n([\s\S]+)")
6 changes: 4 additions & 2 deletions google_play_scraper/features/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@ def parse_dom(dom: str, app_id: str, url: str) -> Dict[str, Any]:

for k, spec in ElementSpecs.Detail.items():
content = spec.extract_content(dataset)

result[k] = content
if content is None:
result[k] = spec.fallback_value
else:
result[k] = content

result["appId"] = app_id
result["url"] = url
Expand Down
15 changes: 12 additions & 3 deletions google_play_scraper/features/reviews.py

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is MAX_COUNT_EACH_FETCH limited to only 4500?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the maximum supported by the Play Store API: a request with a higher number (for example, 4501) does not work, whereas 4500 does.

Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

MAX_COUNT_EACH_FETCH = 199
MAX_COUNT_EACH_FETCH = 4500


class _ContinuationToken:
Expand Down Expand Up @@ -56,8 +56,15 @@ def _fetch_review_items(
{"content-type": "application/x-www-form-urlencoded"},
)
match = json.loads(Regex.REVIEWS.findall(dom)[0])
try:
token = json.loads(match[0][2])[-2][-1]
except:
token = None

return json.loads(match[0][2])[0], json.loads(match[0][2])[-2][-1]
results = json.loads(match[0][2])
if len(results) == 0 or len(results[0]) == 0:
return [], token
return results[0], token


def reviews(
Expand Down Expand Up @@ -113,7 +120,7 @@ def reviews(
filter_device_with,
token,
)
except (TypeError, IndexError):
except Exception:
token = None
break

Expand All @@ -130,6 +137,8 @@ def reviews(
if isinstance(token, list):
token = None
break
if token is None:
break

return (
result,
Expand Down
23 changes: 22 additions & 1 deletion google_play_scraper/utils/request.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import ssl
import time
from typing import Union
from urllib.error import HTTPError
from urllib.request import Request, urlopen

from google_play_scraper.exceptions import ExtraHTTPError, NotFoundError

ssl._create_default_https_context = ssl._create_unverified_context

MAX_RETRIES = 3
RATE_LIMIT_DELAY = 5


def _urlopen(obj):
try:
Expand All @@ -20,7 +27,21 @@ def _urlopen(obj):


def post(url: str, data: Union[str, bytes], headers: dict) -> str:
    """Send a POST request, retrying up to ``MAX_RETRIES`` times.

    An attempt fails either when ``_urlopen`` raises or when the response
    body contains the ``PlayGatewayError`` marker. After a marker failure the
    function waits ``RATE_LIMIT_DELAY`` seconds multiplied by the number of
    marker failures seen so far, then tries again.

    Args:
        url: Target URL.
        data: Request body handed to ``urllib.request.Request``.
        headers: HTTP headers for the request.

    Returns:
        The response body from the first attempt that neither raised nor
        contained the gateway-error marker.

    Raises:
        Exception: The error recorded for the last failed attempt, once all
            attempts are used up. This is either whatever ``_urlopen`` raised
            or a plain ``Exception`` naming the gateway error.
    """
    gateway_error = "com.google.play.gateway.proto.PlayGatewayError"
    last_exception = None
    rate_exceeded_count = 0
    for attempt in range(MAX_RETRIES):
        try:
            resp = _urlopen(Request(url, data=data, headers=headers))
        except Exception as e:
            # Remember the failure and retry immediately.
            last_exception = e
            continue
        if gateway_error in resp:
            rate_exceeded_count += 1
            last_exception = Exception(gateway_error)
            # Back off a little longer after each marker failure, but do not
            # sleep after the final attempt: nothing is left to wait for.
            if attempt < MAX_RETRIES - 1:
                time.sleep(RATE_LIMIT_DELAY * rate_exceeded_count)
            continue
        return resp
    raise last_exception


def get(url: str) -> str:
Expand Down
12 changes: 6 additions & 6 deletions tests/e2e_tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ def test_e2e_scenario_1(self):
self.assertEqual("GAME_SIMULATION", result["genreId"])
self.assertTrue(result["categories"])
self.assertGreaterEqual(len(result["categories"]), 1)
self.assertEqual("Action", result["categories"][0]["name"])
self.assertEqual("GAME_ACTION", result["categories"][0]["id"])
self.assertEqual("Simulation", result["categories"][0]["name"])
self.assertEqual("GAME_SIMULATION", result["categories"][0]["id"])
self.assertEqual(
"https://play-lh.googleusercontent.com/5nPD6fyJaa-EDLHdlBd9UsaAV8KkfrYvLB956eQsvIGNBWUrPeouYw8aa7kbCbY--6E",
result["icon"],
Expand All @@ -82,9 +82,9 @@ def test_e2e_scenario_1(self):
self.assertTrue(result["adSupported"])
self.assertTrue(result["containsAds"])
self.assertEqual("Jan 7, 2014", result["released"])
self.assertEqual(1671717276, result["updated"])
self.assertEqual(1692642233, result["updated"])
self.assertEqual("Varies with device", result["version"])
self.assertTrue(result["comments"])
self.assertFalse(result["comments"])
# self.assertTrue(result["similarApps"])
# self.assertTrue(result["moreByDeveloper"])

Expand All @@ -107,7 +107,7 @@ def test_e2e_scenario_3(self):
res = app("com.sgn.pandapop.gp")

self.assertEqual(
"https://www.youtube.com/embed/lzthjLXbZr0?ps=play&vq=large&rel=0&autohide=1&showinfo=0",
"https://www.youtube.com/embed/pw9e5aIoznY?ps=play&vq=large&rel=0&autohide=1&showinfo=0",
res["video"],
)
self.assertEqual(
Expand All @@ -122,7 +122,7 @@ def test_e2e_scenario_4(self):
res = app("com.simplemobiletools.gallery.pro")

self.assertFalse(res["free"])
self.assertEqual(1.59, res["price"])
self.assertEqual(2.99, res["price"])

# TODO free app / non free app 구분

Expand Down
1 change: 0 additions & 1 deletion tests/e2e_tests/test_permissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def test_reply_data_only_other_type(self):
"Other": [
"control vibration",
"full network access",
"run at startup",
"prevent device from sleeping",
"view network connections",
],
Expand Down
13 changes: 6 additions & 7 deletions tests/e2e_tests/test_reviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@ def test_sort_by_newest(self):
self.assertTrue(r["content"])
self.assertTrue(r["score"] >= 1)
self.assertTrue(r["thumbsUpCount"] >= 0)
self.assertTrue(r["appVersion"])

# self.assertTrue(r["appVersion"]) # FIXME: appVersion is not always available
self.assertTrue(
datetime.now() - timedelta(days=7) < r["at"] < datetime.now()
r["at"] < datetime.now()
)

if r["reviewCreatedVersion"]:
Expand Down Expand Up @@ -127,7 +126,7 @@ def test_review_count_is_under_count_of_first_request(self):
tests length of results of first request is lower than specified count argument
"""

result, ct = reviews("com.ekkorr.endlessfrontier")
result, ct = reviews("com.docentepro.simuladordocentepro")

self.assertTrue(len(result) < 100)

Expand All @@ -138,7 +137,7 @@ def test_continuation_token(self):
tests continuation_token parameter
"""

result, continuation_token = reviews("com.mojang.minecraftpe")
result, continuation_token = reviews("com.mojang.minecraftpe", count=100)

self.assertEqual(100, len(result))
self.assertIsNotNone(continuation_token)
Expand Down Expand Up @@ -211,7 +210,7 @@ def test_priority_between_preserved_argument_of_continuation_token_and_specified
_ = reviews(
"com.mojang.minecraftpe",
continuation_token=_ContinuationToken(
"", "ko", "kr", Sort.MOST_RELEVANT, 10, 5
"", "ko", "kr", Sort.MOST_RELEVANT, 10, 5, None
),
lang="jp",
country="jp",
Expand All @@ -229,7 +228,7 @@ def test_invalid_continuation_token(self):
result, ct = reviews(
"com.mojang.minecraftpe",
continuation_token=_ContinuationToken(
"foo", "ko", "kr", Sort.MOST_RELEVANT, 10, 5
"foo", "ko", "kr", Sort.MOST_RELEVANT, 10, 5, None
),
)

Expand Down
5 changes: 2 additions & 3 deletions tests/e2e_tests/test_reviews_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class TestReviewsAll(TestCase):
def test_request_once(self):
with patch(
"google_play_scraper.features.reviews.reviews", wraps=reviews
"google_play_scraper.features.reviews.reviews", wraps=reviews
) as mock_reviews:
result = reviews_all("co.kr.uaram.userdeliver_")
self.assertEqual(1, mock_reviews.call_count)
Expand All @@ -19,10 +19,9 @@ def test_request_once(self):

def test_request_multiple_times(self):
with patch(
"google_play_scraper.features.reviews.reviews", wraps=reviews
"google_play_scraper.features.reviews.reviews", wraps=reviews
) as mock_reviews:
result = reviews_all("co.kr.uaram.userdeliver_", lang="ko", country="kr")
self.assertEqual(3, mock_reviews.call_count)

result_of_reviews, _ = reviews(
"co.kr.uaram.userdeliver_", lang="ko", country="kr", count=10000
Expand Down
4 changes: 2 additions & 2 deletions tests/e2e_tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_e2e_scenario_1(self):
self.assertEqual("Niantic, Inc.", result["developer"])
self.assertEqual("Adventure", result["genre"])
self.assertEqual(
"https://play-lh.googleusercontent.com/3UpKaqsS-3LDEQJqoNLXkj61eiA-_-h77heP22dYOy-WR4PSha3O_tPK57w4wZ4jIXII",
"https://play-lh.googleusercontent.com/6qUR3CmTyz3lMdMK8GENfibQ9ZQIIgHIP3_pgnYcuG04ykheKtl-dhyPzjlvhF_MANI",
result["icon"],
)
self.assertTrue(result["screenshots"])
Expand All @@ -43,7 +43,7 @@ def test_e2e_scenario_2(self):
"""
Test for different language and country.
"""
results = search("Bestes Pikachu Spiel", lang="de", country="de")
results = search("Uber", lang="es", country="cl")

self.assertGreater(len(results), 0)

Expand Down
Loading