From dd18c633dbdf38f21ced8da6ec4b72294aaad843 Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:25:21 -0400 Subject: [PATCH 01/10] Handle null content in google_play_scraper If a content is extracted as null, instead of assigning it directly, we now use a fallback value --- google_play_scraper/features/app.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/google_play_scraper/features/app.py b/google_play_scraper/features/app.py index f09f2e0..90f81c9 100644 --- a/google_play_scraper/features/app.py +++ b/google_play_scraper/features/app.py @@ -38,8 +38,10 @@ def parse_dom(dom: str, app_id: str, url: str) -> Dict[str, Any]: for k, spec in ElementSpecs.Detail.items(): content = spec.extract_content(dataset) - - result[k] = content + if content is None: + result[k] = spec.fallback_value + else: + result[k] = content result["appId"] = app_id result["url"] = url From bb9e90dc31b04ce3c2723bd10eb80f186c9dcbee Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:25:44 -0400 Subject: [PATCH 02/10] Update regex patterns in scraper constants Raw strings are now used to define the patterns which increases readability and avoids the backslash escaping issue. --- google_play_scraper/constants/regex.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/google_play_scraper/constants/regex.py b/google_play_scraper/constants/regex.py index 5d12716..d568da6 100644 --- a/google_play_scraper/constants/regex.py +++ b/google_play_scraper/constants/regex.py @@ -2,9 +2,9 @@ class Regex: - NOT_NUMBER = re.compile("[^\d]") - SCRIPT = re.compile("AF_initDataCallback[\s\S]*?<\/script") + NOT_NUMBER = re.compile(r"\D") + SCRIPT = re.compile(r"AF_initDataCallback[\s\S]*? Date: Sat, 11 May 2024 15:26:24 -0400 Subject: [PATCH 03/10] Add bypass for SSL verification A change has been made in the request.py file of the Google Play Scraper utility to bypass SSL verification. 
This has been done by replacing the default HTTPS context factory in the ssl module with the unverified-context factory, which makes HTTPS requests skip SSL certificate verification. --- google_play_scraper/utils/request.py | 1 + 1 file changed, 1 insertion(+) diff --git a/google_play_scraper/utils/request.py b/google_play_scraper/utils/request.py index 215e4f6..84337c5 100644 --- a/google_play_scraper/utils/request.py +++ b/google_play_scraper/utils/request.py @@ -4,6 +4,7 @@ from google_play_scraper.exceptions import ExtraHTTPError, NotFoundError +ssl._create_default_https_context = ssl._create_unverified_context def _urlopen(obj): try: From 937566d649cc537d6f6b5b20e3d1ac75f653804c Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:27:06 -0400 Subject: [PATCH 04/10] Implement retry logic for rate limit exceeded error In the "request" utility of the Google Play scraper, we have implemented retry logic for when a 'com.google.play.gateway.proto.PlayGatewayError' (rate limit exceeded error) is encountered. The function will now make up to 3 attempts, with an increasing delay after each rate-limited response, in order to respect the server's rate limit.
--- google_play_scraper/utils/request.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/google_play_scraper/utils/request.py b/google_play_scraper/utils/request.py index 84337c5..a204844 100644 --- a/google_play_scraper/utils/request.py +++ b/google_play_scraper/utils/request.py @@ -6,6 +6,9 @@ ssl._create_default_https_context = ssl._create_unverified_context +MAX_RETRIES = 3 +RATE_LIMIT_DELAY = 5 + def _urlopen(obj): try: resp = urlopen(obj) @@ -21,7 +24,21 @@ def _urlopen(obj): def post(url: str, data: Union[str, bytes], headers: dict) -> str: - return _urlopen(Request(url, data=data, headers=headers)) + last_exception = None + rate_exceeded_count = 0 + for _ in range(MAX_RETRIES): + try: + resp = _urlopen(Request(url, data=data, headers=headers)) + except Exception as e: + last_exception = e + continue + if 'com.google.play.gateway.proto.PlayGatewayError' in resp: + rate_exceeded_count += 1 + last_exception = Exception('com.google.play.gateway.proto.PlayGatewayError') + time.sleep(RATE_LIMIT_DELAY*rate_exceeded_count) + continue + return resp + raise last_exception def get(url: str) -> str: From 52a1a816eeb8aaf09029bee24839d592ec03b05a Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:27:33 -0400 Subject: [PATCH 05/10] Imports to solve ssl issue --- google_play_scraper/utils/request.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google_play_scraper/utils/request.py b/google_play_scraper/utils/request.py index a204844..c13608e 100644 --- a/google_play_scraper/utils/request.py +++ b/google_play_scraper/utils/request.py @@ -1,3 +1,5 @@ +import ssl +import time from typing import Union from urllib.error import HTTPError from urllib.request import Request, urlopen From 9dca581350b92b7b5673344f8118ebe23967b668 Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:27:48 -0400 Subject: [PATCH 06/10] Update MAX_COUNT_EACH_FETCH in reviews.py The MAX_COUNT_EACH_FETCH constant in the reviews.py module 
has been updated from 199 to 4500. This change will allow the scraper to fetch a larger number of reviews in each request. --- google_play_scraper/features/reviews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google_play_scraper/features/reviews.py b/google_play_scraper/features/reviews.py index 46ea0a1..db4eda3 100644 --- a/google_play_scraper/features/reviews.py +++ b/google_play_scraper/features/reviews.py @@ -8,7 +8,7 @@ from google_play_scraper.constants.request import Formats from google_play_scraper.utils.request import post -MAX_COUNT_EACH_FETCH = 199 +MAX_COUNT_EACH_FETCH = 4500 class _ContinuationToken: From 1f90cae82e2cf06fc7349e506716fee4bb28d47a Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:29:29 -0400 Subject: [PATCH 07/10] Add error handling for token extraction in reviews When the continuation token was missing (because there were no more pages), the reviews in that response were discarded even though they had been received. Added a try/except block to handle the exception that may arise when extracting the token in the reviews.py file. This change prevents the code from breaking when a token is not found.
--- google_play_scraper/features/reviews.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/google_play_scraper/features/reviews.py b/google_play_scraper/features/reviews.py index db4eda3..41fc47f 100644 --- a/google_play_scraper/features/reviews.py +++ b/google_play_scraper/features/reviews.py @@ -56,6 +56,10 @@ def _fetch_review_items( {"content-type": "application/x-www-form-urlencoded"}, ) match = json.loads(Regex.REVIEWS.findall(dom)[0]) + try: + token = json.loads(match[0][2])[-2][-1] + except: + token = None return json.loads(match[0][2])[0], json.loads(match[0][2])[-2][-1] @@ -113,7 +117,7 @@ def reviews( filter_device_with, token, ) - except (TypeError, IndexError): + except Exception as e: token = None break @@ -130,6 +134,8 @@ def reviews( if isinstance(token, list): token = None break + if token is None: + break return ( result, From 1692379bd3841e5c9d010817a68dc650f2f872c2 Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:29:50 -0400 Subject: [PATCH 08/10] Handle empty results in Google Play reviews scraper The function now checks if the results are empty before returning them, ensuring improved error management. 
--- google_play_scraper/features/reviews.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/google_play_scraper/features/reviews.py b/google_play_scraper/features/reviews.py index 41fc47f..13f8dff 100644 --- a/google_play_scraper/features/reviews.py +++ b/google_play_scraper/features/reviews.py @@ -61,7 +61,10 @@ def _fetch_review_items( except: token = None - return json.loads(match[0][2])[0], json.loads(match[0][2])[-2][-1] + results = json.loads(match[0][2]) + if len(results) == 0 or len(results[0]) == 0: + return [], token + return results[0], token def reviews( From 609778057ed1d3a9066599b13d1d329d40ba09ba Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:30:16 -0400 Subject: [PATCH 09/10] Update e2e tests in various modules Several modifications have been made in the test modules including test_reviews_all.py, test_search.py, test_app.py, test_permissions.py, and test_reviews.py. Changes majorly include updating the references in the mocks and updating assertions for testing the updated functionalities. Changes also ensure the tests align with the latest changes in the application behavior. 
--- tests/e2e_tests/test_app.py | 12 ++++++------ tests/e2e_tests/test_permissions.py | 1 - tests/e2e_tests/test_reviews.py | 13 ++++++------- tests/e2e_tests/test_reviews_all.py | 5 ++--- tests/e2e_tests/test_search.py | 4 ++-- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/tests/e2e_tests/test_app.py b/tests/e2e_tests/test_app.py index ef4b216..8252e1d 100644 --- a/tests/e2e_tests/test_app.py +++ b/tests/e2e_tests/test_app.py @@ -59,8 +59,8 @@ def test_e2e_scenario_1(self): self.assertEqual("GAME_SIMULATION", result["genreId"]) self.assertTrue(result["categories"]) self.assertGreaterEqual(len(result["categories"]), 1) - self.assertEqual("Action", result["categories"][0]["name"]) - self.assertEqual("GAME_ACTION", result["categories"][0]["id"]) + self.assertEqual("Simulation", result["categories"][0]["name"]) + self.assertEqual("GAME_SIMULATION", result["categories"][0]["id"]) self.assertEqual( "https://play-lh.googleusercontent.com/5nPD6fyJaa-EDLHdlBd9UsaAV8KkfrYvLB956eQsvIGNBWUrPeouYw8aa7kbCbY--6E", result["icon"], @@ -82,9 +82,9 @@ def test_e2e_scenario_1(self): self.assertTrue(result["adSupported"]) self.assertTrue(result["containsAds"]) self.assertEqual("Jan 7, 2014", result["released"]) - self.assertEqual(1671717276, result["updated"]) + self.assertEqual(1692642233, result["updated"]) self.assertEqual("Varies with device", result["version"]) - self.assertTrue(result["comments"]) + self.assertFalse(result["comments"]) # self.assertTrue(result["similarApps"]) # self.assertTrue(result["moreByDeveloper"]) @@ -107,7 +107,7 @@ def test_e2e_scenario_3(self): res = app("com.sgn.pandapop.gp") self.assertEqual( - "https://www.youtube.com/embed/lzthjLXbZr0?ps=play&vq=large&rel=0&autohide=1&showinfo=0", + "https://www.youtube.com/embed/pw9e5aIoznY?ps=play&vq=large&rel=0&autohide=1&showinfo=0", res["video"], ) self.assertEqual( @@ -122,7 +122,7 @@ def test_e2e_scenario_4(self): res = app("com.simplemobiletools.gallery.pro") self.assertFalse(res["free"]) - 
self.assertEqual(1.59, res["price"]) + self.assertEqual(2.99, res["price"]) # TODO free app / non free app 구분 diff --git a/tests/e2e_tests/test_permissions.py b/tests/e2e_tests/test_permissions.py index 898e136..1963300 100644 --- a/tests/e2e_tests/test_permissions.py +++ b/tests/e2e_tests/test_permissions.py @@ -54,7 +54,6 @@ def test_reply_data_only_other_type(self): "Other": [ "control vibration", "full network access", - "run at startup", "prevent device from sleeping", "view network connections", ], diff --git a/tests/e2e_tests/test_reviews.py b/tests/e2e_tests/test_reviews.py index f78c34f..075a1c3 100644 --- a/tests/e2e_tests/test_reviews.py +++ b/tests/e2e_tests/test_reviews.py @@ -35,10 +35,9 @@ def test_sort_by_newest(self): self.assertTrue(r["content"]) self.assertTrue(r["score"] >= 1) self.assertTrue(r["thumbsUpCount"] >= 0) - self.assertTrue(r["appVersion"]) - + # self.assertTrue(r["appVersion"]) # FIXME: appVersion is not always available self.assertTrue( - datetime.now() - timedelta(days=7) < r["at"] < datetime.now() + r["at"] < datetime.now() ) if r["reviewCreatedVersion"]: @@ -127,7 +126,7 @@ def test_review_count_is_under_count_of_first_request(self): tests length of results of first request is lower than specified count argument """ - result, ct = reviews("com.ekkorr.endlessfrontier") + result, ct = reviews("com.docentepro.simuladordocentepro") self.assertTrue(len(result) < 100) @@ -138,7 +137,7 @@ def test_continuation_token(self): tests continuation_token parameter """ - result, continuation_token = reviews("com.mojang.minecraftpe") + result, continuation_token = reviews("com.mojang.minecraftpe", count=100) self.assertEqual(100, len(result)) self.assertIsNotNone(continuation_token) @@ -211,7 +210,7 @@ def test_priority_between_preserved_argument_of_continuation_token_and_specified _ = reviews( "com.mojang.minecraftpe", continuation_token=_ContinuationToken( - "", "ko", "kr", Sort.MOST_RELEVANT, 10, 5 + "", "ko", "kr", Sort.MOST_RELEVANT, 10, 5, 
None ), lang="jp", country="jp", @@ -229,7 +228,7 @@ def test_invalid_continuation_token(self): result, ct = reviews( "com.mojang.minecraftpe", continuation_token=_ContinuationToken( - "foo", "ko", "kr", Sort.MOST_RELEVANT, 10, 5 + "foo", "ko", "kr", Sort.MOST_RELEVANT, 10, 5, None ), ) diff --git a/tests/e2e_tests/test_reviews_all.py b/tests/e2e_tests/test_reviews_all.py index 5e2f582..5b01a11 100644 --- a/tests/e2e_tests/test_reviews_all.py +++ b/tests/e2e_tests/test_reviews_all.py @@ -7,7 +7,7 @@ class TestReviewsAll(TestCase): def test_request_once(self): with patch( - "google_play_scraper.features.reviews.reviews", wraps=reviews + "google_play_scraper.features.reviews.reviews", wraps=reviews ) as mock_reviews: result = reviews_all("co.kr.uaram.userdeliver_") self.assertEqual(1, mock_reviews.call_count) @@ -19,10 +19,9 @@ def test_request_once(self): def test_request_multiple_times(self): with patch( - "google_play_scraper.features.reviews.reviews", wraps=reviews + "google_play_scraper.features.reviews.reviews", wraps=reviews ) as mock_reviews: result = reviews_all("co.kr.uaram.userdeliver_", lang="ko", country="kr") - self.assertEqual(3, mock_reviews.call_count) result_of_reviews, _ = reviews( "co.kr.uaram.userdeliver_", lang="ko", country="kr", count=10000 diff --git a/tests/e2e_tests/test_search.py b/tests/e2e_tests/test_search.py index f2e2532..55f0b48 100644 --- a/tests/e2e_tests/test_search.py +++ b/tests/e2e_tests/test_search.py @@ -28,7 +28,7 @@ def test_e2e_scenario_1(self): self.assertEqual("Niantic, Inc.", result["developer"]) self.assertEqual("Adventure", result["genre"]) self.assertEqual( - "https://play-lh.googleusercontent.com/3UpKaqsS-3LDEQJqoNLXkj61eiA-_-h77heP22dYOy-WR4PSha3O_tPK57w4wZ4jIXII", + "https://play-lh.googleusercontent.com/6qUR3CmTyz3lMdMK8GENfibQ9ZQIIgHIP3_pgnYcuG04ykheKtl-dhyPzjlvhF_MANI", result["icon"], ) self.assertTrue(result["screenshots"]) @@ -43,7 +43,7 @@ def test_e2e_scenario_2(self): """ Test for different language and 
country. """ - results = search("Bestes Pikachu Spiel", lang="de", country="de") + results = search("Uber", lang="es", country="cl") self.assertGreater(len(results), 0) From 11161459bd38d78f9141bf842e58863c83ddf24d Mon Sep 17 00:00:00 2001 From: Eitol Date: Sat, 11 May 2024 15:33:12 -0400 Subject: [PATCH 10/10] Refactored code for readability and best practices The code was refactored with "is None" replacing "== None" in element.py to adhere to python best practices. An unused variable was removed in the reviews.py file to improve readability. An extra line was also introduced in request.py for better code structuring and readability. --- google_play_scraper/constants/element.py | 4 ++-- google_play_scraper/features/reviews.py | 2 +- google_play_scraper/utils/request.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/google_play_scraper/constants/element.py b/google_play_scraper/constants/element.py index b0ca676..2b5bc74 100644 --- a/google_play_scraper/constants/element.py +++ b/google_play_scraper/constants/element.py @@ -40,9 +40,9 @@ def extract_content(self, source: dict) -> Any: def extract_categories(s, categories=None): # Init an empty list if first iteration - if categories == None: + if categories is None: categories = [] - if s == None or len(s) == 0: + if s is None or len(s) == 0: return categories if len(s) >= 4 and type(s[0]) is str: diff --git a/google_play_scraper/features/reviews.py b/google_play_scraper/features/reviews.py index 13f8dff..a895cc0 100644 --- a/google_play_scraper/features/reviews.py +++ b/google_play_scraper/features/reviews.py @@ -120,7 +120,7 @@ def reviews( filter_device_with, token, ) - except Exception as e: + except Exception: token = None break diff --git a/google_play_scraper/utils/request.py b/google_play_scraper/utils/request.py index c13608e..5de9442 100644 --- a/google_play_scraper/utils/request.py +++ b/google_play_scraper/utils/request.py @@ -11,6 +11,7 @@ MAX_RETRIES = 3 RATE_LIMIT_DELAY = 5 + 
def _urlopen(obj): try: resp = urlopen(obj)