From ce0fd08d89efd89c6cec15a89a923a1c920f44f9 Mon Sep 17 00:00:00 2001 From: Prasanth Date: Fri, 15 Sep 2017 17:07:33 +0530 Subject: [PATCH 1/7] draft version --- crawler.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 crawler.py diff --git a/crawler.py b/crawler.py new file mode 100644 index 0000000..11f8345 --- /dev/null +++ b/crawler.py @@ -0,0 +1,22 @@ +import scrapy +links = [] +codes_list = [] +class Spider(scrapy.Spider): + name = 'spider' + start_urls = ['http://localhost:8080'] + def parse(self, response): + for codes in response.css('div.codes > h1 ::text'): + codes_list.append(codes.extract()) + for next_page in response.css('div > a.link'): + if(next_page.extract() in links): + yield + else: + links.append(next_page.extract()) + yield response.follow(next_page, self.parse) + + def closed(reason, r2): + print len(links) + print len(codes_list) + print "====================" + codes_list.sort() + print codes_list.pop(0) From 4d2b30b627aace5ad150f242b2dd6c2a68cb5cee Mon Sep 17 00:00:00 2001 From: Prasanth Date: Fri, 15 Sep 2017 17:11:05 +0530 Subject: [PATCH 2/7] cleaned up --- crawler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/crawler.py b/crawler.py index 11f8345..290f1ea 100644 --- a/crawler.py +++ b/crawler.py @@ -15,8 +15,5 @@ def parse(self, response): yield response.follow(next_page, self.parse) def closed(reason, r2): - print len(links) - print len(codes_list) - print "====================" codes_list.sort() print codes_list.pop(0) From 6799c3157219ae2333f9a1cec6c20c011a1c48e5 Mon Sep 17 00:00:00 2001 From: Prasanth Date: Fri, 15 Sep 2017 18:40:09 +0530 Subject: [PATCH 3/7] second version --- crawler.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/crawler.py b/crawler.py index 290f1ea..2b2f89f 100644 --- a/crawler.py +++ b/crawler.py @@ -1,19 +1,28 @@ import scrapy -links = [] -codes_list = [] +from datetime import datetime 
+timenow=datetime.now() +from heapq import heappush, heappop +code_list = [] + class Spider(scrapy.Spider): - name = 'spider' - start_urls = ['http://localhost:8080'] - def parse(self, response): - for codes in response.css('div.codes > h1 ::text'): - codes_list.append(codes.extract()) - for next_page in response.css('div > a.link'): - if(next_page.extract() in links): - yield - else: - links.append(next_page.extract()) - yield response.follow(next_page, self.parse) + name = 'spider' + start_urls = ['http://localhost:8080'] + custom_settings = { + 'LOG_ENABLED': 'false', + 'CONCURRENT_REQUESTS': 4, + 'CONCURRENT_REQUESTS_PER_DOMAIN': 4 + } + def __init__(self, url=None): + self.something = url + + def parse(self, response): + local_codes = [] + for codes in response.css('div.codes > h1 ::text'): + heappush(local_codes, codes.extract()) + yield heappush(code_list, heappop(local_codes)) + for next_page in response.css('a'): + yield response.follow(next_page,callback=self.parse) - def closed(reason, r2): - codes_list.sort() - print codes_list.pop(0) + def closed(self, reason): + print heappop(code_list) + print datetime.now() - timenow From c43074f6189a11f4090372c1844440f62f2edb68 Mon Sep 17 00:00:00 2001 From: Prasanth Date: Sat, 16 Sep 2017 15:39:52 +0530 Subject: [PATCH 4/7] crawler with a sample test. 
--- crawler.py | 7 ++----- crawler.test.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 crawler.test.py diff --git a/crawler.py b/crawler.py index 2b2f89f..1e5e3f8 100644 --- a/crawler.py +++ b/crawler.py @@ -1,15 +1,13 @@ import scrapy -from datetime import datetime -timenow=datetime.now() from heapq import heappush, heappop code_list = [] -class Spider(scrapy.Spider): +class MyBaseSpider(scrapy.Spider): name = 'spider' start_urls = ['http://localhost:8080'] custom_settings = { 'LOG_ENABLED': 'false', - 'CONCURRENT_REQUESTS': 4, + 'CONCURRENT_REQUESTS': 2, 'CONCURRENT_REQUESTS_PER_DOMAIN': 4 } def __init__(self, url=None): @@ -25,4 +23,3 @@ def parse(self, response): def closed(self, reason): print heappop(code_list) - print datetime.now() - timenow diff --git a/crawler.test.py b/crawler.test.py new file mode 100644 index 0000000..596b520 --- /dev/null +++ b/crawler.test.py @@ -0,0 +1,18 @@ +import unittest +from scrapy.crawler import CrawlerProcess +from crawler import MyBaseSpider + + +crawlerProcess = CrawlerProcess() +# crawlerProcess.install() +# crawlerProcess.configure() + + +class TestStringMethods(unittest.TestCase): + + def test_isupper(self): + crawlerProcess.crawl(MyBaseSpider) + crawlerProcess.start() + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 58ce92a85ce4474a289bcd66ce4c5bdab278cd43 Mon Sep 17 00:00:00 2001 From: Prasanth Date: Sat, 16 Sep 2017 16:17:28 +0530 Subject: [PATCH 5/7] updated travis file. 
--- .travis.yml | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6f57fbc..8434d19 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,16 @@ sudo: false -language: node_js -cache: - yarn: true - directories: - - node_modules -notifications: - email: false -node_js: - - 'stable' +language: + - python + - node_js before_script: - - npm test -branches: - except: - - /^v\d+\.\d+\.\d+$/ + - npm install +python: + - 3.5 +install: + - pip install -r requirements.txt +script: + - cd dream11/ + - scrapyd & + - cd .. + - npm start & + - curl http://localhost:6800/schedule.json -d project=default -d spider=linkspider \ No newline at end of file From f7f08b02e76708e75a77fe27290a0d9a44a12b51 Mon Sep 17 00:00:00 2001 From: Prasanth Date: Sat, 16 Sep 2017 16:21:41 +0530 Subject: [PATCH 6/7] added requirements file --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..42d97c9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +Scrapy=1.4.0 \ No newline at end of file From 3543e47d6be5d7c6d21701d4e7d0c72bb2857fc2 Mon Sep 17 00:00:00 2001 From: Prasanth Date: Sat, 16 Sep 2017 16:25:00 +0530 Subject: [PATCH 7/7] updated the req. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 42d97c9..a247d0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -Scrapy=1.4.0 \ No newline at end of file +Scrapy == 1.4.0 \ No newline at end of file