Skip to content

Commit

Permalink
Handle failed challenge submissions, update README, remove PyExecJS d…
Browse files Browse the repository at this point in the history
…ependency
  • Loading branch information
Anorov committed Feb 5, 2018
1 parent 8bf4daf commit d78325c
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 33 deletions.
15 changes: 10 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,6 @@ For reference, this is the default message Cloudflare uses for these sorts of pa

Any script using cloudflare-scrape will sleep for 5 seconds for the first visit to any site with Cloudflare anti-bots enabled, though no delay will occur after the first request.

Warning
======

**Due to a critical security vulnerability, if you are running any version below 1.9 please upgrade to version 1.9 or higher immediately.** Versions before 1.9.0 used unsafe Javascript execution mechanisms which could result in arbitrary code execution. If you are running a vulnerable version, a malicious website owner could craft a page which executes arbitrary code on the machine that runs this script. This can only occur if the website that the user attempts to scrape has specifically prepared a page to exploit vulnerable versions of cfscrape.

Installation
============

Expand Down Expand Up @@ -88,6 +83,16 @@ scraper = cfscrape.create_scraper(sess=session)

Unfortunately, not all of Requests' session attributes are easily transferable, so if you run into problems with this, you should replace your initial `sess = requests.session()` call with `sess = cfscrape.create_scraper()`.

### Delays

Normally, when a browser is faced with a Cloudflare IUAM challenge page, Cloudflare requires the browser to wait 5 seconds before submitting the challenge answer. If a website is under heavy load, sometimes this may fail. One solution is to increase the delay (perhaps to 10 or 15 seconds, depending on the website). If you would like to override this delay, pass the `delay` keyword argument to `create_scraper()` or `CloudflareScraper()`.

There is no need to override this delay unless cloudflare-scrape generates an error recommending you increase the delay.

```python
scraper = cfscrape.create_scraper(delay=10)
```

## Integration

It's easy to integrate cloudflare-scrape with other applications and tools. Cloudflare uses two cookies as tokens: one to verify you made it past their challenge page and one to track your session. To bypass the challenge page, simply include both of these cookies (with the appropriate user-agent) in all HTTP requests you make.
Expand Down
70 changes: 43 additions & 27 deletions cfscrape/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
import logging
import random
import re
from requests.sessions import Session
import subprocess
from copy import deepcopy
from time import sleep

import execjs
from requests.sessions import Session

try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse

__version__ = "1.9.3"
__version__ = "1.9.4"

DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
Expand All @@ -24,32 +24,49 @@

DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)

BUG_REPORT = ("Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a "
"bug report at https://github.com/Anorov/cloudflare-scrape/issues.")
# Suffix appended to error/log messages when the Cloudflare IUAM Javascript
# fails to execute or yields an unexpected answer. Backslash-newlines join
# the physical lines so only the intended line break appears in the message.
BUG_REPORT = """\
Cloudflare may have changed their technique, or there may be a bug in the script.
Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \
bug report at https://github.com/Anorov/cloudflare-scrape/issues.\
"""

# Message for the ValueError raised when the response is still a Cloudflare
# challenge page after the challenge answer has been submitted.
# Backslash-newlines join the physical lines; the only real line break in the
# message is the one following the create_scraper(delay=10) example.
ANSWER_ACCEPT_ERROR = """\
The challenge answer was not properly accepted by Cloudflare. This can occur if \
the target website is under heavy load, or if Cloudflare is experiencing issues. You can \
potentially resolve this by increasing the challenge answer delay (default: 5 seconds). \
For example: cfscrape.create_scraper(delay=10)
If increasing the delay does not help, please open a GitHub issue at \
https://github.com/Anorov/cloudflare-scrape/issues\
"""

class CloudflareScraper(Session):
def __init__(self, *args, **kwargs):
    """Create a scraper session.

    Keyword Args:
        delay: Challenge answer delay in seconds, stored on ``self.delay``
            (default: 5). It is removed from ``kwargs`` before the
            remaining arguments are forwarded to ``Session.__init__``.
    """
    # Pop exactly once: a second pop would always return the default and
    # silently discard the caller's value.
    self.delay = kwargs.pop("delay", 5)
    super(CloudflareScraper, self).__init__(*args, **kwargs)

    if "requests" in self.headers["User-Agent"]:
        # Set a random User-Agent if no custom User-Agent has been set
        self.headers["User-Agent"] = DEFAULT_USER_AGENT

def is_cloudflare_challenge(self, resp):
    """Return True if ``resp`` looks like a Cloudflare anti-bot challenge page.

    A response is treated as a challenge only when all of the following
    hold: the status code is 503, the ``Server`` header starts with
    ``cloudflare``, and the body contains both the ``jschl_vc`` and
    ``jschl_answer`` markers.

    Args:
        resp: Response object exposing ``status_code``, ``headers`` and
            ``content`` (bytes).

    Returns:
        bool: True when every condition above is met, otherwise False.
    """
    if resp.status_code != 503:
        return False

    # A missing Server header is treated as "not Cloudflare".
    if not resp.headers.get("Server", "").startswith("cloudflare"):
        return False

    body = resp.content
    return b"jschl_vc" in body and b"jschl_answer" in body

def request(self, method, url, *args, **kwargs):
    """Send a request, solving a Cloudflare challenge if one is returned.

    The request is first issued through ``Session.request``. If the
    response is a Cloudflare challenge page, the challenge is solved via
    ``solve_cf_challenge`` and that result is returned instead.

    Args:
        method: HTTP method, forwarded to ``Session.request``.
        url: Target URL, forwarded to ``Session.request``.
        *args: Extra positional arguments for ``Session.request``.
        **kwargs: Extra keyword arguments for ``Session.request``; also
            passed on to ``solve_cf_challenge``.

    Returns:
        The original response when no challenge was detected, otherwise
        the response obtained after solving the challenge.

    Raises:
        ValueError: If the response is still a challenge page after the
            answer was submitted.
    """
    resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

    # Check if Cloudflare anti-bot is on
    if self.is_cloudflare_challenge(resp):
        resp = self.solve_cf_challenge(resp, **kwargs)
        # Still being challenged means the answer was not accepted;
        # fail loudly instead of handing back the challenge page.
        if self.is_cloudflare_challenge(resp):
            raise ValueError(ANSWER_ACCEPT_ERROR)

    # Otherwise, no Cloudflare anti-bot detected
    return resp

def solve_cf_challenge(self, resp, **original_kwargs):
Expand Down Expand Up @@ -111,33 +128,32 @@ def solve_challenge(self, body):

# Use vm.runInNewContext to safely evaluate code
# The sandboxed code cannot use the Node.js standard library
js = "return require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000});" % js
js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js

try:
node = execjs.get("Node")
except Exception:
raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")

try:
result = node.exec_(js)
result = subprocess.check_output(["node", "-e", js]).strip()
except OSError as e:
if e.errno == 2:
raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
raise
except Exception:
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
raise

try:
result = int(result)
except Exception:
raise ValueError("Cloudflare IUAM challenge returned unexpected value. %s" % BUG_REPORT)
raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT)

return result

@classmethod
def create_scraper(cls, sess=None, **kwargs):
"""
Convenience function for creating a ready-to-go requests.Session (subclass) object.
Convenience function for creating a ready-to-go CloudflareScraper object.
"""
scraper = cls()
scraper = cls(**kwargs)

if sess:
attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
Expand Down
10 changes: 9 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
import os
import re
from setuptools import setup

# Directory containing this setup script; used to locate the package source.
base_path = os.path.dirname(__file__)

# Read the version string from cfscrape/__init__.py so that it is defined in
# exactly one place.
with open(os.path.join(base_path, 'cfscrape', '__init__.py')) as fp:
    # Anchor at the start of a line so only the module-level assignment matches.
    _version_match = re.search(r'^__version__ = "(.*?)"', fp.read(), re.M)

if _version_match is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError("Unable to find __version__ in cfscrape/__init__.py")

VERSION = _version_match.group(1)

setup(
name = 'cfscrape',
packages = ['cfscrape'],
version = '1.9.3',
version = VERSION,
description = 'A simple Python module to bypass Cloudflare\'s anti-bot page. See https://github.com/Anorov/cloudflare-scrape for more information.',
author = 'Anorov',
author_email = '[email protected]',
Expand Down

0 comments on commit d78325c

Please sign in to comment.