diff --git a/cfscrape/__init__.py b/cfscrape/__init__.py index 38debd4..841e427 100644 --- a/cfscrape/__init__.py +++ b/cfscrape/__init__.py @@ -96,6 +96,7 @@ def __init__(self, *args, **kwargs): # Define headers to force using an OrderedDict and preserve header order self.headers = headers + self.org_method = None self.mount("https://", CloudflareAdapter()) @@ -152,7 +153,13 @@ def solve_cf_challenge(self, resp, **original_kwargs): body = resp.text parsed_url = urlparse(resp.url) domain = parsed_url.netloc - submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) + challenge_form = re.search(r'\',body, flags=re.S).group(0) # find challenge form + method = re.search(r'method=\"(.*?)\"', challenge_form, flags=re.S).group(1) + if self.org_method is None: + self.org_method = resp.request.method + submit_url = "%s://%s%s" % (parsed_url.scheme, + domain, + re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[0]) cloudflare_kwargs = copy.deepcopy(original_kwargs) @@ -160,13 +167,29 @@ def solve_cf_challenge(self, resp, **original_kwargs): headers["Referer"] = resp.url try: - params = cloudflare_kwargs["params"] = OrderedDict( - re.findall(r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body) - ) + cloudflare_kwargs["params"] = dict() + cloudflare_kwargs["data"] = dict() + if len(re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')) != 1: + for param in re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[1].split('&'): + cloudflare_kwargs["params"].update({param.split('=')[0]:param.split('=')[1]}) + + for input_ in re.findall(r'\|\<\/input\>)', challenge_form, flags=re.S): + if re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1) != 'jschl_answer': + if method == 'POST': + cloudflare_kwargs["data"].update({re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1): + re.search(r'value=\"(.*?)\"',input_, flags=re.S).group(1)}) + elif method == 'GET': + cloudflare_kwargs["params"].update({re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1): + re.search(r'value=\"(.*?)\"',input_, flags=re.S).group(1)}) + if method == 'POST': + for k in ("jschl_vc", "pass"): + if k not in cloudflare_kwargs["data"]: + raise ValueError("%s is missing from challenge form" % k) + elif method == 'GET': + for k in ("jschl_vc", "pass"): + if k not in cloudflare_kwargs["params"]: + raise ValueError("%s is missing from challenge form" % k) - for k in ("jschl_vc", "pass"): - if k not in params: - raise ValueError("%s is missing from challenge form" % k) except Exception as e: # Something is wrong with the page. # This may indicate Cloudflare has changed their anti-bot @@ -179,12 +202,14 @@ def solve_cf_challenge(self, resp, **original_kwargs): # Solve the Javascript challenge answer, delay = self.solve_challenge(body, domain) - params["jschl_answer"] = answer + if method == 'POST': + cloudflare_kwargs["data"]["jschl_answer"] = answer + elif method == 'GET': + cloudflare_kwargs["params"]["jschl_answer"] = answer # Requests transforms any request into a GET after a redirect, # so the redirect has to be handled manually here to allow for # performing other types of requests even as the first request. - method = resp.request.method cloudflare_kwargs["allow_redirects"] = False # Cloudflare requires a delay before solving the challenge @@ -192,42 +217,56 @@ def solve_cf_challenge(self, resp, **original_kwargs): # Send the challenge response and handle the redirect manually redirect = self.request(method, submit_url, **cloudflare_kwargs) - redirect_location = urlparse(redirect.headers["Location"]) - - if not redirect_location.netloc: - redirect_url = urlunparse( - ( - parsed_url.scheme, - domain, - redirect_location.path, - redirect_location.params, - redirect_location.query, - redirect_location.fragment, + if "Location" in redirect.headers: + redirect_location = urlparse(redirect.headers["Location"]) + + if not redirect_location.netloc: + redirect_url = urlunparse( + ( + parsed_url.scheme, + domain, + redirect_location.path, + redirect_location.params, + redirect_location.query, + redirect_location.fragment, + ) ) - ) - return self.request(method, redirect_url, **original_kwargs) - return self.request(method, redirect.headers["Location"], **original_kwargs) + return self.request(method, redirect_url, **original_kwargs) + return self.request(method, redirect.headers["Location"], **original_kwargs) + elif "Set-Cookie" in redirect.headers: + if 'cf_clearance' in redirect.headers['Set-Cookie']: + resp = self.request(self.org_method, submit_url, cookies = redirect.cookies) + return resp + else: + return self.request(method, redirect_url, **original_kwargs) + else: + resp = self.request(self.org_method, submit_url, **cloudflare_kwargs) + return resp + def solve_challenge(self, body, domain): try: + javascript = re.search(r'\ + + + + + + + + + +
+
+ + + +
+ + + + +
+ +
+ + +
+ DDoS protection by Cloudflare +
+ Ray ID: 53d393f93ae1c82f +
+
+ + \ No newline at end of file