# PySpark example: reduceByKey word counts, then tallying Daily Show guests per year.

# Import SparkContext from pyspark and create a context for this session.
# NOTE: the original used typographic quotes (“a”), which Python rejects
# with a SyntaxError — they must be plain ASCII quotes.
from pyspark import SparkContext

sc = SparkContext()

from operator import add

# Toy RDD of (key, 1) pairs; "a" appears twice so its counts will be summed.
rdd1 = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])

# reduceByKey(add) sums the values per key on the cluster; collect() is the
# action that brings the results to the driver; sorted() orders the pairs.
# Expected result: [('a', 2), ('b', 1)]
sorted(rdd1.reduceByKey(add).collect())

!curl -L https://github.com/fivethirtyeight/data/blob/master/daily-show-guests/daily_show_guests.csv -o daily.csv

!head -10 daily.csv

# Load the CSV as an RDD of raw text lines (ASCII quotes, not “smart” quotes,
# which are a SyntaxError).
raw = sc.textFile("daily.csv")

raw.take(5)

# Split each line on commas into a list of fields.
# NOTE(review): a plain split(',') breaks on quoted fields containing commas —
# fine for a quick look, but use the csv module for real parsing.
daily = raw.map(lambda line: line.split(','))

daily.take(5)

# Aggregate total count of guests per year (field 0 is the year column).
# BUG FIX: the chained .reduceByKey on its own continuation line was a
# SyntaxError — the chain must be wrapped in parentheses (or kept on one line).
tally = (daily.map(lambda x: (x[0], 1))
              .reduceByKey(lambda x, y: x + y))

# This prints the RDD object, not its contents — Spark is lazy, so no
# computation has happened yet.
print(tally)

# An action materialises the RDD. collect() returns all elements in one job;
# the original take(tally.count()) triggered an extra count() job for the
# same result.
tally.collect()

Question: how do I sort the tally by year? Since the year is the key, use the sortByKey() transformation before collecting: tally.sortByKey().collect().

Leave a Reply