WordCount_new.json

﻿{"paragraphs":[{"text":"%spark.pyspark\nimport sys, re\nimport nltk","user":"anonymous","dateUpdated":"2018-09-17T18:49:56+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1536700724399_647054160","id":"20180910-005353_412387350","dateCreated":"2018-09-11T21:18:44+0000","dateStarted":"2018-09-17T18:49:56+0000","dateFinished":"2018-09-17T18:50:42+0000","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:232"},{"text":"%spark.pyspark\n\nimport os\n#os.system('cd bin')\nos.system('ls /')\n","user":"anonymous","dateUpdated":"2018-09-11T22:58:08+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"archive_spark\nbin\nboot\ncore\ndev\netc\nhome\nlib\nlib64\nlogs\nmedia\nmnt\nmy_files\nnotebook\nopt\nproc\nroot\nrun\nsbin\nshare_dir\nsrv\nsys\ntmp\nusr\nvar\nzeppelin\nzeppelin_docker\n0\n"}]},"apps":[],"jobName":"paragraph_1536701003758_-619370302","id":"20180911-212323_1702642604","dateCreated":"2018-09-11T21:23:23+0000","dateStarted":"2018-09-11T22:58:08+0000","dateFinished":"2018-09-11T22:58:08+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:233"},{"text":"%spark.pyspark\n#demo_data = sqlContext.read.json(\"/home/liuqing/Documents/DLRL/DLRL_Courses/CS4984_f18_TextSummarization/PySpark_Demo/Demo.json\")\ndemo_data = sqlContext.read.json(\"/share_dir/Demos/Current.json\")\n#demo_data = sqlContext.read.json(\"file:///~/docker/Demos/Demo.json\")\ndemo_data.show()","user":"anonymous","dateUpdated":"2018-09-17T18:50:04+0000","config":{"tableHide":false,"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"+--------------------+--------------+--------------------+\n|         Sentences_t|   Timestamp_s|               URL_s|\n+--------------------+--------------+--------------------+\n|-- It's Primary E...|20180819002456|https://wtvr.com/...|\n|In a race that pr...|20180819002419|https://www.washi...|\n|Less than half of...|20180819002416|https://www.nbcwa...|\n|Midterms Matter. ...|20180819002641|http://bluevirgin...|\n|Party leaders sai...|20180819002707|http://www.washin...|\n|Virginia Politics...|20180819002537|https://www.washi...|\n|In a race that pr...|20180819002420|https://www.washi...|\n|Midterms Matter. ...|20180819002554|http://bluevirgin...|\n|But state issues ...|20180819002412|https://www.natio...|\n|Let America Vote,...|20180819002530|http://www.washin...|\n|We expose the met...|20180819002356|https://m.washing...|\n|Midterms Matter. ...|20180819002621|http://bluevirgin...|\n|Colgan (D-Prince ...|20180819002424|https://www.washi...|\n|April 17, 2018 1:...|20180819002506|https://www.peopl...|\n|A physician and t...|20180819002536|https://www.washi...|\n|The GOP has lost ...|20180819002533|https://www.washi...|\n|along state senat...|20180819002422|https://www.washi...|\n|Now, because Boll...|20180819002527|http://www.washin...|\n|Midterms Matter. ...|20180819002654|http://bluevirgin...|\n|In a race that pr...|20180819002419|https://www.washi...|\n+--------------------+--------------+--------------------+\nonly showing top 20 rows\n\n"}]},"apps":[],"jobName":"paragraph_1536700724490_1150414433","id":"20180826-163143_698248606","dateCreated":"2018-09-11T21:18:44+0000","dateStarted":"2018-09-17T18:50:05+0000","dateFinished":"2018-09-17T18:51:01+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:234"},{"text":"%spark.pyspark\n\n# only select the column \"Sentences_t\" and convert Dataframe into basic RDD\nsentences = demo_data.select('Sentences_t').rdd","user":"anonymous","dateUpdated":"2018-09-17T18:52:15+0000","config":{"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1536700724504_846687646","id":"20180910-010728_1977717666","dateCreated":"2018-09-11T21:18:44+0000","dateStarted":"2018-09-17T18:52:15+0000","dateFinished":"2018-09-17T18:52:15+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:235"},{"text":"%spark.pyspark\nnltk.download('stopwords')\nstop_words = nltk.corpus.stopwords.words('english')\n\n# remove blank lines\nsentences_remove_null = sentences.filter(lambda x: len(x) > 0).collect()\n\n# core process for wordcount\nwordcount = sc.parallelize(sentences_remove_null).flatMap(lambda line: re.split('\\W+', line[0])).filter(lambda word: word.lower() not in stop_words).filter(lambda word: len(word) > 0).map(lambda word: (word.lower(), 1)).reduceByKey(lambda v1, v2: v1 + v2).map(lambda x: (x[1], x[0])).sortByKey(ascending = False)\ntype(sc)\n\nwordcount.take(50)","user":"anonymous","dateUpdated":"2018-09-17T18:52:54+0000","config":{"lineNumbers":false,"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"colWidth":12,"editorMode":"ace/mode/python","fontSize":9,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"[nltk_data] Downloading package stopwords to /root/nltk_data...\n[nltk_data]   Package stopwords is already up-to-date!\n[(1979, u'virginia'), (1088, u'state'), (1036, u'democrats'), (860, u'republicans'), (834, u'house'), (806, u'election'), (800, u'republican'), (759, u'senate'), (635, u'democratic'), (590, u'district'), (535, u'said'), (518, u'vote'), (508, u'would'), (504, u'year'), (503, u'voters'), (497, u'mcauliffe'), (495, u'one'), (449, u'race'), (445, u'elections'), (436, u'control'), (432, u'governor'), (429, u'party'), (408, u'races'), (402, u'matter'), (401, u'midterms'), (381, u'two'), (378, u'northam'), (361, u'county'), (349, u'democrat'), (338, u'seat'), (337, u'general'), (334, u'also'), (321, u'day'), (313, u'campaign'), (299, u'news'), (289, u'va'), (289, u'tuesday'), (287, u'could'), (286, u'delegates'), (286, u'seats'), (284, u'r'), (283, u'brown'), (271, u'even'), (265, u'gop'), (265, u'candidates'), (264, u'last'), (262, u'new'), (256, u'votes'), (250, u'politics'), (246, u'first')]\n"}]},"apps":[],"jobName":"paragraph_1536700724511_98969776","id":"20180910-004006_645553182","dateCreated":"2018-09-11T21:18:44+0000","dateStarted":"2018-09-17T18:52:55+0000","dateFinished":"2018-09-17T18:52:57+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:236"},{"text":"%spark.pyspark\ntype(sc)\n","user":"anonymous","dateUpdated":"2018-09-12T20:31:25+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{}},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"<class 'pyspark.context.SparkContext'>\n"}]},"apps":[],"jobName":"paragraph_1536706591003_-2044504359","id":"20180911-225631_1850591456","dateCreated":"2018-09-11T22:56:31+0000","dateStarted":"2018-09-12T20:31:25+0000","dateFinished":"2018-09-12T20:31:25+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:237"},{"text":"%spark.pyspark\r\nfrom pyspark.mllib.feature import HashingTF, IDF\r\nhashingTF = HashingTF()\r\ntf = hashingTF.transform(sentences_remove_null)\r\n\r\n# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:\r\n# First to compute the IDF vector and second to scale the term frequencies by IDF.\r\nidf = IDF().fit(tf)\r\ntfidf = idf.transform(tf)\r\n\r\n# spark.mllib's IDF implementation provides an option for ignoring terms\r\n# which occur in less than a minimum number of documents.\r\n# In such cases, the IDF for these terms is set to 0.\r\n# This feature can be used by passing the minDocFreq value to the IDF constructor.\r\nidfIgnore = IDF(minDocFreq=2).fit(tf)\r\ntfidfIgnore = idfIgnore.transform(tf)","user":"anonymous","dateUpdated":"2018-09-12T20:54:24+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"Fail to execute line 7: idf = IDF().fit(tf)\r\nTraceback (most recent call last):\n  File \"/tmp/zeppelin_pyspark-4343385658997875626.py\", line 375, in <module>\n    exec(code, _zcUserQueryNameSpace)\n  File \"<stdin>\", line 7, in <module>\n  File \"/archive_spark/spark-2.2.1-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/mllib/feature.py\", line 572, in fit\n    raise TypeError(\"dataset should be an RDD of term frequency vectors\")\nTypeError: dataset should be an RDD of term frequency vectors\n"}]},"apps":[],"jobName":"paragraph_1536783901547_-213316666","id":"20180912-202501_1960510059","dateCreated":"2018-09-12T20:25:01+0000","dateStarted":"2018-09-12T20:54:24+0000","dateFinished":"2018-09-12T20:54:24+0000","status":"ERROR","progressUpdateIntervalMs":500,"$$hashKey":"object:238"},{"text":"%spark.pyspark\ntf.cache()","user":"anonymous","dateUpdated":"2018-09-12T20:54:18+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"scala","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"Fail to execute line 1: tf.cache()\nTraceback (most recent call last):\n  File \"/tmp/zeppelin_pyspark-4343385658997875626.py\", line 380, in <module>\n    exec(code, _zcUserQueryNameSpace)\n  File \"<stdin>\", line 1, in <module>\nAttributeError: 'SparseVector' object has no attribute 'cache'\n"}]},"apps":[],"jobName":"paragraph_1536784382749_-518739227","id":"20180912-203302_291838459","dateCreated":"2018-09-12T20:33:02+0000","dateStarted":"2018-09-12T20:54:18+0000","dateFinished":"2018-09-12T20:54:18+0000","status":"ERROR","progressUpdateIntervalMs":500,"$$hashKey":"object:239"},{"text":"%spark.pyspark\n","user":"anonymous","dateUpdated":"2018-09-12T20:33:08+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{}},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1536784388557_-1011451660","id":"20180912-203308_1390594156","dateCreated":"2018-09-12T20:33:08+0000","status":"READY","progressUpdateIntervalMs":500,"$$hashKey":"object:240"}],"name":"WordCount","id":"2DQ2HZAPP","noteParams":{},"noteForms":{},"angularObjects":{"spark:shared_process":[]},"config":{"isZeppelinNotebookCronEnable":false,"looknfeel":"default","personalizedMode":"false"},"info":{}}