Skip to content

Commit

Permalink
cleaned up communities classifier notebook
Browse files Browse the repository at this point in the history
Signed-off-by: Krzysztof Nowak <[email protected]>
  • Loading branch information
Krzysztof Nowak committed May 30, 2018
1 parent 8639c93 commit 69c2ac4
Showing 1 changed file with 37 additions and 181 deletions.
218 changes: 37 additions & 181 deletions communities-sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
Expand All @@ -27,10 +25,8 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"./comms.json\", \"r\") as fp:\n",
Expand All @@ -39,10 +35,8 @@
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spam = ['lybinh', 'tieutieuhiep480549',]\n",
Expand Down Expand Up @@ -71,10 +65,8 @@
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"previously_deleted_spam = ['5400', 'obatsakitmaagyangmanjurdanseringdicari', 'agen-poker', 'boyaqq', 'cekipokernet-agen-poker-online-android-uang-asli-terbaik-indonesia', 'tuanpoker', 'jeeptoto', 'test321', 'mycommunity', 'grosiramazonplus', 'ie', 'strangers_in_the_night', 'egames', 'zenodo-testing', 'the-biggest-online-sports-betting-site-in-malaysia', 'shira', 'thethaoqq188', 'searchengineoptimization', 'pokeronlineterpercaya', 'best-gaming-laptops', 'loto188', 'bandarceme', 'review3', 'domino99', 'sayangseo', 'bongdaqq188', 'jnepoker', 'onlineslotqq101-com-slot-machine-games', 'test-123', 'onlinecasinoqq', 'agen-judi-poker-online-terbaik', 'wargakartu', 'cemarapoker-situs-judi-poker-dan-domino-online-terpercaya-1', 'casino-website', 'sahabatkartucom', 'sahabatqqcasinocom', 'dewa_poker', 'kontesseo3', 'j', '11111111', 'oo22', 'ngentod', 'cemarapoker-situs-judi-poker-dan-domino-online-terpercaya-2506', 'qjoker', '043', 'slotqq188marimain', '002', '288', 'test1000', 'sahabatkartu-com-agen-poker-online', 'sahabatqq-casino', 'rajapoker', 'infojudi', 'sayangseo1', 'masterjudi88', 'ledstairnosingaustralia', 'partyvenueschicago', 'sportsqq288com-the-biggest-online-sports-betting-site-in-malaysia', 'kuramang', 'sahabatkartu', '021', 'interqqcom', '098', 'longxaodua', 'situsbandardanagenjudipokeruangasliterbesardanterpercayaindonesia', 'akifa_naila', 'masteragen', 'ajoqq', 'ichadinitraqq188', 'agen-poker-resmi-terpercaya-dan-terbaik', 'cemarapoker-situs-judi-poker-dan-domino-online-terpercaya', 'cemarapoker', 'interqq', '001', 'kapalpoker-com', 'kapal_poker', 'jeniusseo', '365365', '087', 'amour', 'pandawa', '034', 'casinovietqq288hh', 'trusted-live-casino-gambling-website-in-malaysia', 'sahabatqq-casino-agen-domino-99-dan-poker-online-terbesar-di-asia', 'pokerdominoqq-online-situs-agen-poker-domino-qq-online-terpercaya', 'pokerdominoseo', 'jadipuas1', '086', '12', 'sahabatqqcasinoagendomino99danpokeronlineterbesardiasia', '112', 'pokerqq288', 'qq288', 
'rajapoker88-situs-agen-judi-poker-bandar-domino-qq-online-terpercaya', 'putrilaura', '553', 'sahabatkartu-com-agen-poker-domino-99-online-bandarq-terpercaya-indonesia', 'seo', 'sahabatqqcasino', '007', 'sarana-pelangi-agen-judi-domino-qq-bandar-poker-dan-bandar-qiu-qiu-99-terpercaya-seasia', 'saranapelangiagen', 'superbejoq', 'bolaqiuqiu', 'bandarjudi', 'casinoqq188', 'casinoqq288', 'casinovietqq101', 'casinovietqq188', 'casinovnqq188', 'qq188', 'casinovqq188', 'linda', '101', 'livecasinoonlineqq101', 'livecasinoonlineqq', 'nandalistiohadi', 'nha-cai-danh-bai-truc-tuyen-casino-viet-qq288', 'casino', 'casinowebsite', 'live-casino-website', 'the-best-live-casinos-site-in-malaysia', 'live-casino', 'online-casino', 'sahabatqqdotcasino', 'sarana118-com-agen-judi-sbobet-live-casino-dan-togel-online-terpercaya-seasia', 'sarana118-com', 'casinoq288', 'casino-e-game-hap-dan-nhat-hien-nay', 'livecasinoonline', 'alternatifnyasehat', 'bliherbal', 'obatherbalsatu', 'herbalufi', 'hendi', 'bbbbbbbbbbb222222222222222', 'diherbalamazonplus', 'buycbdoil', 'mooremike', 'mooremike3', 'frankcurtis04', 'tokobukuonline', '1', 'agendomino99', 'agen_togel_online_terpercaya_zodiaktoto', 'bettingqq101', 'betting101', 'sports', 'slotgames', 'androidslots', 'slot-games-online', 'slot', 'slotmachines', 'onlineslotqq101-slot-machine-games-free-slot-betting-website', 'slotqq188', 'slot-machine', 'online-sports-betting', 'football288-', 'bettingslotqq188', 'onlineslotqq1881', 'onlineslotqq188', 'onlineslotqq288', 'bolaqq188', 'cdbd288', 'ilottoqq188', 'qq188asia', 'qq188asia-best-online-sports-bookie-website-asia-top-free-bets-bookmaker', 'bettingonline', 'onlinebetting', 'betting', 'sportsbetting', 'bettingsports', 'thethaovqq188', 'songbai', '188', 'review_terpercaya', 'sportsbook', 'sahabatqqcasino-agen-domino-99-dan-poker-online-terbesar-di-asia', 'thaothaoonline', 'cuocbongdaqq188', 'auduongkhac', 'nhacaitructuyen', 'gameonline', 'comrang', 'blackjacktvietqq288', 'bolaqq288', 
'menangbesarbolaqq288', 'songvedem', 'bongdaqq288', 'cemeqq288-com-situs-agen-judi-ceme-online-indonesia', 'situsjudiqq288', 'poker', 'baotichnhuoc', 'test42', 'thethao', 'thethaoqq288', 'qq101', 'agensportsbookqq101', 'bandarbolaqq101', 'slotgamevietqq101', '77', 'slot188', 'bwinqq', 'caridomino', 'judi-kartu-domino-online-cocok-menjadi-permainan-semua-orang', 'saranapelangiagenjudidominoqq', 'sejarahqq-net-agen-dominoqq-online-bandarq-terpercaya', 'wtcdomino-com', 'wtcdomino', 'agentqjoker', 'rickpetko91795', 'rajaseoweb', 'agen-judi-kartu-permainan-terlengkap', '112345', 'sports-betting', 'sellmyhousefastindianapolis', 'testing', 'casinoonlineqq', 'buestestcommunity', 'herbalkankerpayudara11', 'online-betting', 'simple-and-basic-tips-and-advises-to-improve-your-online-casino-gaming', 'bandar', 'naga388', 'depoqqnetagenbandarqdominoqiuqiu', 'sahabatqq1', 'toko4d', '097', 'subhajit', 'infobet99', 'obatfrigid', 'kue-lebaran', 'aa', 'smallbusiness', 'bandarpelangi2']\n",
Expand All @@ -83,30 +75,17 @@
},
{
"cell_type": "code",
"execution_count": 110,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"895"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"len(set(spam))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Communities which were removed but were manually checked and found not to be spam\n",
Expand All @@ -115,10 +94,8 @@
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = []\n",
Expand All @@ -131,20 +108,9 @@
},
{
"cell_type": "code",
"execution_count": 112,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"895"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Number of entries in X whose 'spam' flag is truthy.\n",
"sum(1 for entry in X if entry['spam'])"
]
Expand All @@ -159,9 +125,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"X_maybespam = [x for x in X if not x['spam'] and\n",
Expand All @@ -174,9 +138,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"new_spam_ids = [x['id'] for x in X_maybespam]\n",
Expand All @@ -186,9 +148,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"for x in X_maybespam:\n",
Expand All @@ -204,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 165,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -240,10 +200,8 @@
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"acc = [idx for idx, (ref, pred) in enumerate(zip(y_test, y_pred)) if (ref, pred) == (False, True)]\n",
Expand All @@ -252,53 +210,18 @@
},
{
"cell_type": "code",
"execution_count": 141,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('agriprima',\n",
" 'Agriprima, Journal of Applied Agricultural Sciences',\n",
" '<p><strong>Agriprima,</strong>&nbsp;Journal of Applied Agricultural Sciences adalah Jurnal Ilmu Pertanian Terapan yang menjadi sarana bagi peneliti untuk mempublikasikan hasil penelitiannya dalam lingkup pemuliaan tanaman, bioteknologi tanaman, teknologi benih, perlindungan tanaman, dan kesuburan tanah.</p>\\r\\n\\r\\n<p>Agriprima diterbitkan oleh Jurusan Produksi Pertanian Politeknik Negeri Jember bekerjasama dengan Politeknik, Fakultas Pertanian serta Pusat Penelitian Kopi dan Kakao Indonesia.</p>\\r\\n'),\n",
" ('talkinmaths',\n",
" 'Talk in Mathematics',\n",
" '<p>This is a&nbsp;collection of transcriptions of&nbsp;mathematics classroom interactions. &nbsp;It is intended to grow over time and diversify to include a wider range of transcriptions, such as transcriptions of groups working on mathematics tasks, and transcriptions of lessons in subjects other than mathematics</p>\\r\\n'),\n",
" ('prueba', 'Prueba', '<p>test</p>\\r\\n')]"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"spammy_stuff"
]
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'curation_policy': '',\n",
" 'deleted_at': 'None',\n",
" 'description': '<p>Grup LSKK selalu membuka diri untuk bekerja sama dengan sebanyak mungkin mitra untuk mencapai pencapaian terbaik</p>\\r\\n',\n",
" 'id': 'lskk',\n",
" 'id_user': 23096,\n",
" 'page': '',\n",
" 'spam': True,\n",
" 'title': 'Lab. Sistem Kendali dan Komputer'}"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"[x for x in X if x['id'] == 'lskk'][0]"
]
Expand All @@ -312,10 +235,8 @@
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import LeaveOneOut, KFold\n",
Expand Down Expand Up @@ -359,23 +280,9 @@
},
{
"cell_type": "code",
"execution_count": 156,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Counter({(False, False): 1168,\n",
" (False, True): 10,\n",
" (True, False): 27,\n",
" (True, True): 868})"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Pair every reference label in y with the matching entry of res,\n",
"# then tally how often each (reference, predicted) combination occurs.\n",
"acc = list(zip(y, res))\n",
"Counter(acc)"
Expand All @@ -395,10 +302,8 @@
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"acc = [idx for idx, (ref, pred) in enumerate(zip(y, res)) if (ref, pred) == (False, True)]\n",
Expand All @@ -407,47 +312,9 @@
},
{
"cell_type": "code",
"execution_count": 160,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('docshyr',\n",
" 'ДокШир',\n",
" '<p>ДокШир створено задля безперешкодного поширення ініціатив руху Відкритого доступу серед українських фахівців бібліотечної справи.</p>\\r\\n'),\n",
" ('researchtools',\n",
" 'Research Tools Box By: Dr. Nader Ale Ebrahim',\n",
" '<p>This Topic is designed to assist students to aim at reducing the search time by increasing their knowledge to more effectively use the &quot;Research Tools&quot; which is available through the Net.</p>\\r\\n\\r\\n<p>Created by Nader Ale Ebrahim</p>\\r\\n'),\n",
" ('kne-test',\n",
" 'The Egyptian Chemical Society',\n",
" '<html>\\r\\n<head>\\r\\n<meta content=\"text/html; charset=ISO-8859-1\"\\r\\nhttp-equiv=\"content-type\">\\r\\n<title>About Journal Sidebar</title>\\r\\n</head>\\r\\n<body>\\r\\n<p style=\"font-family: Arial;\"><img style=\"width: 212px; height: 300px;\"\\r\\nalt=\"Egyptian Journal of Chemistry Produced\"\\r\\nsrc=\"http://www.knowledgee.com/new-platform/wp-content/uploads/2014/12/Picture1-212x300.jpg\"><br>\\r\\n</p>\\r\\n<p style=\"font-family: Arial;\">Egyptian Journal of Chemistry Produced\\r\\nand hosted on behalf Academy of Scientific Research and Technology\\r\\nInformatics Sector and Scientific&nbsp;Services The National Centre for\\r\\nInformation and Documentation (NIDOC), Dokki, Cairo, Egypt</p>\\r\\n<p style=\"font-family: Arial;\">Edited by:&nbsp;&nbsp;&nbsp; The\\r\\nEgyptian Chemical&nbsp; Society<br>\\r\\nISSN:&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; 0449-2285</p>\\r\\n<p>&nbsp;</p>\\r\\n</body>\\r\\n</html>'),\n",
" ('prueba', 'Prueba', '<p>test</p>\\r\\n'),\n",
" ('cobp',\n",
" 'The Complexity of Obesity Proceedings',\n",
" '<p>This is an open access conference journal according to the&nbsp;Norwegian Social Science&nbsp;Data Services.&nbsp;We publish proceedings that have been satisfactory presented at &quot;The Complexity of Obesity Conferences&quot; or related events.&nbsp;</p>\\r\\n'),\n",
" ('saranauthorunismuh',\n",
" 'LP3M Unismuh Makassar',\n",
" '<p>Lembaga Penenlitian, Pengembangan dan Pengabdian Masyarakat</p>\\r\\n'),\n",
" ('semfefu',\n",
" 'Школа экономики и менеджмента Дальневосточного федерального университета',\n",
" '<p><font><font>Школа экономики и менеджмента включает в себя 14 кафедр, реализующих широкий спектр образовательных программ по всем направлениям подготовки экономистов и менеджеров: 13 образовательных программ бакалавриата и 22 магистерских программы. </font><font>Школа активно развивает экономическое, технологическое и культурное сотрудничество со странами АТР и другими государствами. </font><font>Выпускники Школы востребованы в крупнейших российских и международных компаниях.</font></font></p>\\r\\n'),\n",
" ('talkinmaths',\n",
" 'Talk in Mathematics',\n",
" '<p>This is a&nbsp;collection of transcriptions of&nbsp;mathematics classroom interactions. &nbsp;It is intended to grow over time and diversify to include a wider range of transcriptions, such as transcriptions of groups working on mathematics tasks, and transcriptions of lessons in subjects other than mathematics</p>\\r\\n'),\n",
" ('agriprima',\n",
" 'Agriprima, Journal of Applied Agricultural Sciences',\n",
" '<p><strong>Agriprima,</strong>&nbsp;Journal of Applied Agricultural Sciences adalah Jurnal Ilmu Pertanian Terapan yang menjadi sarana bagi peneliti untuk mempublikasikan hasil penelitiannya dalam lingkup pemuliaan tanaman, bioteknologi tanaman, teknologi benih, perlindungan tanaman, dan kesuburan tanah.</p>\\r\\n\\r\\n<p>Agriprima diterbitkan oleh Jurusan Produksi Pertanian Politeknik Negeri Jember bekerjasama dengan Politeknik, Fakultas Pertanian serta Pusat Penelitian Kopi dan Kakao Indonesia.</p>\\r\\n'),\n",
" ('open-literature-reviews',\n",
" 'Open Literature Reviews',\n",
" '<p>This community is for all researchers who want to share the datasets from their systematic literature reviews or mapping studies. See format instructions on the about page.</p>\\r\\n')]"
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"spammy_stuff"
]
Expand All @@ -461,20 +328,9 @@
},
{
"cell_type": "code",
"execution_count": 166,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['2017_06_18_communities_spam.pkl']"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.\n",
"# Prefer the standalone joblib package; fall back to the vendored copy only on\n",
"# older installs where standalone joblib is not available.\n",
"try:\n",
"    import joblib\n",
"except ImportError:\n",
"    from sklearn.externals import joblib\n",
"\n",
"# Serialize text_clf to disk; dump() returns the list of file names it wrote.\n",
"joblib.dump(text_clf, '2017_06_18_communities_spam.pkl')"
Expand All @@ -497,7 +353,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.6.5"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 69c2ac4

Please sign in to comment.