forked from Smerity/cc-mrjob
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mrjob.conf
28 lines (26 loc) · 1.13 KB
/
mrjob.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# mrjob configuration for running jobs on Amazon Elastic MapReduce (EMR).
# Every option must be nested under runners -> emr; keys left at the top level
# of the document are not part of the EMR runner's configuration.
runners:
  emr:
    aws_region: us-west-1

    # Credentials: either export the environment variables AWS_ACCESS_KEY_ID
    # and AWS_SECRET_ACCESS_KEY, or uncomment and fill in the two options
    # below. Do not commit real credentials to version control.
    #aws_access_key_id: ...
    #aws_secret_access_key: ...

    # For more control (e.g. SSH access to the cluster), it is highly
    # recommended to add your EC2 key pair.
    #ec2_key_pair: ...
    #ec2_key_pair_file: ...
    #ssh_tunnel_to_job_tracker: true

    # Cluster shape: instance type for the nodes and for the master, and the
    # total number of instances.
    ec2_instance_type: m1.xlarge
    ec2_master_instance_type: m1.xlarge
    # Bid prices are quoted so they are passed on as strings, not floats.
    ec2_master_instance_bid_price: '0.1'
    ec2_core_instance_bid_price: '0.1'
    num_ec2_instances: 2

    # EMR comes with Python 2.6 by default -- installing Python 2.7 takes a
    # while but might be necessary.
    # There is a newer AMI version, but it has issues with the released
    # stable mrjob.
    ami_version: '3.0.4'
    interpreter: python2.7

    # Bootstrap commands install Python 2.7, pip, and the packages needed for
    # streaming compressed files from S3 or reading WARC files.
    bootstrap:
      - sudo yum install -y python27 python27-devel gcc-c++
      # NOTE(review): the trailing '#' is intentional -- it appears to be
      # mrjob's "path#" upload syntax, which ships the local get-pip.py to
      # the node. It is not a stray comment marker; do not remove it.
      - sudo python2.7 get-pip.py#
      - sudo pip2.7 install boto mrjob simplejson warc phonenumbers
      - sudo pip2.7 install https://github.com/commoncrawl/gzipstream/archive/master.zip