From d78325caf015f87eebd5c162bc9ed6797806a4d2 Mon Sep 17 00:00:00 2001
From: Anorov
Date: Mon, 5 Feb 2018 14:51:21 -0500
Subject: [PATCH] Handle failed challenge submissions, update README, remove
 PyExecJS dependency

---
 README.md            | 15 ++++++----
 cfscrape/__init__.py | 70 +++++++++++++++++++++++++++-----------------
 setup.py             | 10 ++++++-
 3 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index a7cefb1..486eb5e 100644
--- a/README.md
+++ b/README.md
@@ -19,11 +19,6 @@ For reference, this is the default message Cloudflare uses for these sorts of pa
 
 Any script using cloudflare-scrape will sleep for 5 seconds for the first visit to any site with Cloudflare anti-bots enabled, though no delay will occur after the first request.
 
-Warning
-======
-
-**Due to a critical security vulnerability, if you are running any version below 1.9 please upgrade to version 1.9 or higher immediately.** Versions before 1.9.0 used unsafe Javascript execution mechanisms which could result in arbitrary code execution. If you are running a vulnerable version, a malicious website owner could craft a page which executes arbitrary code on the machine that runs this script. This can only occur if the website that the user attempts to scrape has specifically prepared a page to exploit vulnerable versions of cfscrape.
-
 Installation
 ============
 
@@ -88,6 +83,16 @@ scraper = cfscrape.create_scraper(sess=session)
 
 Unfortunately, not all of Requests' session attributes are easily transferable, so if you run into problems with this, you should replace your initial `sess = requests.session()` call with `sess = cfscrape.create_scraper()`.
 
+### Delays
+
+Normally, when a browser is faced with a Cloudflare IUAM challenge page, Cloudflare requires the browser to wait 5 seconds before submitting the challenge answer. If a website is under heavy load, the challenge submission may sometimes fail. One solution is to increase the delay (perhaps to 10 or 15 seconds, depending on the website). To override the delay, pass the `delay` keyword argument to `create_scraper()` or `CloudflareScraper()`.
+
+There is no need to override this delay unless cloudflare-scrape generates an error recommending you increase it.
+
+```python
+scraper = cfscrape.create_scraper(delay=10)
+```
+
 ## Integration
 
 It's easy to integrate cloudflare-scrape with other applications and tools. Cloudflare uses two cookies as tokens: one to verify you made it past their challenge page and one to track your session. To bypass the challenge page, simply include both of these cookies (with the appropriate user-agent) in all HTTP requests you make.
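The Integration paragraph above amounts to replaying Cloudflare's two token cookies together with a matching user-agent. A minimal sketch of that flow (not part of this patch; the URL and variable names are placeholders, and `__cfduid`/`cf_clearance` are the cookie names Cloudflare used at the time of writing):

```python
import cfscrape
import requests

# Solve the challenge once with cloudflare-scrape, then reuse the resulting
# cookies and User-Agent in a plain Requests session (or any other HTTP client).
scraper = cfscrape.create_scraper()
scraper.get("http://example-protected-site.com/")  # placeholder URL

tokens = {
    "__cfduid": scraper.cookies.get("__cfduid"),
    "cf_clearance": scraper.cookies.get("cf_clearance"),
}
user_agent = scraper.headers["User-Agent"]

resp = requests.get(
    "http://example-protected-site.com/",
    cookies=tokens,
    headers={"User-Agent": user_agent},
)
print(resp.status_code)
```

Both cookies and the user-agent must come from the same scraper instance, since Cloudflare ties the clearance cookie to the user-agent that solved the challenge.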
diff --git a/cfscrape/__init__.py b/cfscrape/__init__.py
index b341e18..3a81115 100644
--- a/cfscrape/__init__.py
+++ b/cfscrape/__init__.py
@@ -1,18 +1,18 @@
 import logging
 import random
 import re
-from requests.sessions import Session
+import subprocess
 from copy import deepcopy
 from time import sleep
-import execjs
+from requests.sessions import Session
 
 try:
     from urlparse import urlparse
 except ImportError:
     from urllib.parse import urlparse
 
-__version__ = "1.9.3"
+__version__ = "1.9.4"
 
 DEFAULT_USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
@@ -24,32 +24,49 @@
 
 DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
 
-BUG_REPORT = ("Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a "
-"bug report at https://github.com/Anorov/cloudflare-scrape/issues.")
+BUG_REPORT = """\
+Cloudflare may have changed their technique, or there may be a bug in the script.
+
+Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \
+bug report at https://github.com/Anorov/cloudflare-scrape/issues.\
+"""
+ANSWER_ACCEPT_ERROR = """\
+The challenge answer was not properly accepted by Cloudflare. This can occur if \
+the target website is under heavy load, or if Cloudflare is experiencing issues. You can \
+potentially resolve this by increasing the challenge answer delay (default: 5 seconds). \
+For example: cfscrape.create_scraper(delay=10)
+
+If increasing the delay does not help, please open a GitHub issue at \
+https://github.com/Anorov/cloudflare-scrape/issues\
+"""
 
 
 class CloudflareScraper(Session):
     def __init__(self, *args, **kwargs):
-        self.delay = kwargs.pop('delay', 5)
-
+        self.delay = kwargs.pop("delay", 5)
         super(CloudflareScraper, self).__init__(*args, **kwargs)
 
         if "requests" in self.headers["User-Agent"]:
-            # Spoof Firefox on Linux if no custom User-Agent has been set
+            # Set a random User-Agent if no custom User-Agent has been set
             self.headers["User-Agent"] = DEFAULT_USER_AGENT
 
+    def is_cloudflare_challenge(self, resp):
+        return (
+            resp.status_code == 503
+            and resp.headers.get("Server", "").startswith("cloudflare")
+            and b"jschl_vc" in resp.content
+            and b"jschl_answer" in resp.content
+        )
+
     def request(self, method, url, *args, **kwargs):
         resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
 
         # Check if Cloudflare anti-bot is on
-        if ( resp.status_code == 503
-             and resp.headers.get("Server", "").startswith("cloudflare")
-             and b"jschl_vc" in resp.content
-             and b"jschl_answer" in resp.content
-        ):
-            return self.solve_cf_challenge(resp, **kwargs)
-
-        # Otherwise, no Cloudflare anti-bot detected
+        if self.is_cloudflare_challenge(resp):
+            resp = self.solve_cf_challenge(resp, **kwargs)
+            if self.is_cloudflare_challenge(resp):
+                raise ValueError(ANSWER_ACCEPT_ERROR)
+
         return resp
 
     def solve_cf_challenge(self, resp, **original_kwargs):
@@ -111,16 +128,15 @@ def solve_challenge(self, body):
 
         # Use vm.runInNewContext to safely evaluate code
         # The sandboxed code cannot use the Node.js standard library
-        js = "return require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000});" % js
+        js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js
 
         try:
-            node = execjs.get("Node")
-        except Exception:
-            raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
-                                   " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
-
-        try:
-            result = node.exec_(js)
+            result = subprocess.check_output(["node", "-e", js]).strip()
+        except OSError as e:
+            if e.errno == 2:
+                raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
+                                       " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
+            raise
         except Exception:
             logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
             raise
@@ -128,16 +144,16 @@ def solve_challenge(self, body):
         try:
             result = int(result)
         except Exception:
-            raise ValueError("Cloudflare IUAM challenge returned unexpected value. %s" % BUG_REPORT)
+            raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT)
 
         return result
 
     @classmethod
     def create_scraper(cls, sess=None, **kwargs):
         """
-        Convenience function for creating a ready-to-go requests.Session (subclass) object.
+        Convenience function for creating a ready-to-go CloudflareScraper object.
         """
-        scraper = cls()
+        scraper = cls(**kwargs)
 
         if sess:
             attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
diff --git a/setup.py b/setup.py
index 42e904e..d225268 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,17 @@
+import os
+import re
 from setuptools import setup
 
+base_path = os.path.dirname(__file__)
+
+with open(os.path.join(base_path, 'cfscrape', '__init__.py')) as fp:
+    VERSION = re.compile(r'.*__version__ = "(.*?)"',
+                         re.S).match(fp.read()).group(1)
+
 setup(
     name = 'cfscrape',
     packages = ['cfscrape'],
-    version = '1.9.3',
+    version = VERSION,
     description = 'A simple Python module to bypass Cloudflare\'s anti-bot page. See https://github.com/Anorov/cloudflare-scrape for more information.',
     author = 'Anorov',
     author_email = 'anorov.vorona@gmail.com',
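For reference, the evaluation mechanism the new `solve_challenge` relies on can be exercised on its own. Below is a minimal sketch (not part of the patch): the arithmetic expression and variable names are placeholders standing in for the actual Cloudflare challenge JavaScript that cfscrape extracts from the page, and Node.js must be installed, per the README's Dependencies section.

```python
import subprocess

# Stand-in for the JavaScript extracted from the challenge page.
challenge_js = "var t = 7; t * 6 + 3"

# Wrap it the same way solve_challenge does: evaluate inside Node's vm module
# with an empty global object (so the sandboxed code cannot reach the Node.js
# standard library) and a 5-second timeout, then print the result so the
# Python side can capture it from stdout.
wrapper = (
    "console.log(require('vm').runInNewContext("
    "'%s', Object.create(null), {timeout: 5000}));"
) % challenge_js

answer = int(subprocess.check_output(["node", "-e", wrapper]).strip())
print(answer)  # 45
```

The patch keeps the `vm.runInNewContext` sandboxing from the previous version but invokes Node directly through `subprocess`, which is what allows the PyExecJS dependency to be dropped.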