-
Notifications
You must be signed in to change notification settings - Fork 0
/
RDD_teach_script.py
36 lines (30 loc) · 1.01 KB
/
RDD_teach_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
# Entry point to Spark; the RDDs below are all created through this context.
sc = SparkContext()

# Create an RDD from an in-memory Python list. `parallelize` is a method of
# the SparkContext, so it has to be called on `sc`.
nums = sc.parallelize([1, 2, 3, 4])

# Create an RDD from a text file; keep the result so it can be used later.
lines = sc.textFile("file://...")  # or s3n://, hdfs://

# Wrap the SparkContext in a Hive context to run SQL queries against Hive
# from within Spark.
# NOTE(review): newer Spark releases favor SparkSession with Hive support
# over HiveContext -- confirm against the Spark version in use.
hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT name, age FROM users;")
# RDDs can also be created from:
# - JDBC
# - Cassandra
# - HBase
# - ElasticSearch
# - JSON, CSV, sequence files, object files, various compressed formats
# RDD functions for transformation:
# - map
# - flatMap
# - filter
# - distinct
# - sample
# - union, intersection, subtract, cartesian
# Transformation example: `map` applies the lambda to every element,
# producing a new RDD of the squared values.
rdd = sc.parallelize([1, 2, 3, 4])
squaredRDD = rdd.map(lambda x: x * x)
# Keep the original (misspelled) name as an alias so existing references
# to it keep working.
sqauredRDD = squaredRDD
# RDD actions:
# - collect: take all of the RDD's results and suck them down to the driver script.
# - count: how many rows are in the RDD
# - countByValue: count how many times each unique value occurs
# - take: return only the first n results (handy for debugging)
# - top: return the n largest results (handy for debugging)
# - reduce: combine all elements into a single value using a two-argument function
# - etc.
# Transformations are lazy: nothing actually runs in the driver program until an action is called.