Merge pull request #2 from experius/release/0.2.0
Release/0.2.0
egordm authored Feb 25, 2020
2 parents f1cdf6e + 16393a3 commit f2d38da
Showing 9 changed files with 217 additions and 44 deletions.
37 changes: 36 additions & 1 deletion README.md
@@ -5,4 +5,39 @@ cp .env.example .env
# Edit default settings
docker-compose up -d --build
```

# Commands
### Cache
Handles caching of the pages associated with the given websites. `WEBSITE_IDS` is a comma-separated list of website ids, e.g. `1,2`.
```
Usage: crawl.py cache [OPTIONS] WEBSITE_IDS

Options:
  --follow_next BOOLEAN  Follow rel-next links if enabled
  --recache BOOLEAN      Recache all pages instead of only pages not yet cached
  --use_queue BOOLEAN    Cache URLs from the queue instead of the sitemap
  --load BOOLEAN         Whether URLs already loaded in the cache server should be scraped instead
  --help                 Show this message and exit.
```

### Clean
Handles cleaning of the dashboard queue
```
Usage: crawl.py clean [OPTIONS] WEBSITE_IDS

Options:
  --help  Show this message and exit.
```

# Examples
```
# Cache the sitemap of website 1
docker-compose run cachewarmer cache 1
# Cache requests in the queue for websites 1 and 2
docker-compose run cachewarmer cache 1,2 --use_queue=true
# Clean the queue for websites 1 and 2
docker-compose run cachewarmer clean 1,2
```
57 changes: 52 additions & 5 deletions crawl.py
@@ -1,11 +1,58 @@
#!/usr/bin/python
import os, sys
import os, sys, logging

from seosnap_cachewarmer import logging
from scrapy.cmdline import execute
import click
from functools import reduce
from dotenv import load_dotenv
from scrapy.cmdline import execute
from scrapy.utils.log import configure_logging

from seosnap_cachewarmer.service import SeosnapService

load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '.env'))
configure_logging(install_root_handler=False)
logging.basicConfig(
filename=os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs/cachewarmer.log'),
level=os.getenv('CACHEWARMER_LOG_LEVEL')
)


@click.group()
def cli():
pass


@cli.command()
@click.argument('website_ids')
def load(website_ids: str, **args):
for website_id in website_ids.split(','):
click.echo(f'Loading website: {website_id}')
        arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)


@cli.command()
@click.argument('website_ids')
@click.option('--follow_next', type=bool, default=True, help='Follow rel-next links if enabled')
@click.option('--recache', type=bool, default=True, help='Recache all pages instead of only pages not yet cached')
@click.option('--use_queue', type=bool, default=False, help='Cache URLs from the queue instead of the sitemap')
@click.option('--load', type=bool, default=False, help='Whether URLs already loaded in the cache server should be scraped instead')
def cache(website_ids: str, **args):
for website_id in website_ids.split(','):
click.echo(f'Caching website: {website_id}')
arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()])
execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)


@cli.command()
@click.argument('website_ids')
def clean(website_ids: str):
service = SeosnapService()
for website_id in website_ids.split(','):
service.clean_queue(int(website_id))
logging.info(f'Cleaned the queue for website: {website_id}')
click.echo(f'Cleaned the queue for websites: {website_ids}')


if len(sys.argv) < 2: raise Exception('Missing website_id as argument')
execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={sys.argv[1]}'] + sys.argv[2:])
if __name__ == '__main__':
cli()
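
For reference, a minimal sketch of how the click options above are flattened into Scrapy spider arguments by the `cache` and `load` commands (the option values here are example defaults, not taken from a real run):

```python
from functools import reduce

# Example option values as collected by click (assumed for illustration)
args = {'follow_next': True, 'recache': True, 'use_queue': False, 'load': False}

# Each key/value pair becomes a ['-a', 'key=value'] token pair for `scrapy crawl`
arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
print(arg_tokens)
# ['-a', 'follow_next=True', '-a', 'recache=True', '-a', 'use_queue=False', '-a', 'load=False']
```
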
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
scrapy
coreapi==2.3.3
python-dotenv==0.10.3
click==7.0
17 changes: 16 additions & 1 deletion seosnap_cachewarmer/exporter.py
@@ -9,11 +9,13 @@
class SeosnapItemExporter(BaseItemExporter):
website_id: int
service: SeosnapService
use_queue: bool
buffer: List[dict]
buffer_size: int = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 50))

def __init__(self, website_id, buffer_size=None, **kwargs):
def __init__(self, website_id, use_queue=False, buffer_size=None, **kwargs):
super().__init__(**kwargs)
self.use_queue = use_queue
if buffer_size: self.buffer_size = buffer_size
self.website_id = website_id

@@ -31,5 +33,18 @@ def export_item(self, item):

def flush(self):
if len(self.buffer) > 0:
if self.use_queue: self.flush_queue()
self.service.update_pages(self.website_id, self.buffer)
self.buffer = []

def flush_queue(self):
items = [
{
'page': {
'address': item['address']
},
'status': 'completed' if item['status_code'] // 200 == 1 else 'failed'
}
for item in self.buffer
]
self.service.update_queue(self.website_id, items)
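
As a rough illustration of the payload `flush_queue` builds, here is how one buffered page item maps onto a queue entry (the item values are made up for this sketch):

```python
# One buffered page item (hypothetical values)
item = {'address': '/category/shoes', 'status_code': 200}

# Mapped onto a queue update entry, mirroring flush_queue above
queue_entry = {
    'page': {'address': item['address']},
    'status': 'completed' if item['status_code'] // 200 == 1 else 'failed',
}
# -> {'page': {'address': '/category/shoes'}, 'status': 'completed'}
```
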
10 changes: 6 additions & 4 deletions seosnap_cachewarmer/middleware.py
@@ -7,13 +7,15 @@

class CacheServerMiddleware(object):
def process_request(self, request: Request, spider: SeosnapSpider):
state = spider.state

if CACHE_REQUEST_FLAG not in request.meta \
and request.url not in spider.sitemap_urls \
and request.url not in state.sitemap_urls() \
and not request.url.endswith('sitemap.xml') \
and spider.cacheserver_url:
and state.cacheserver_url:
request.meta[CACHE_REQUEST_FLAG] = True
return request.replace(
url=f'{spider.cacheserver_url}/{request.url}',
method='PUT' if spider.recache else 'GET'
url=f'{state.cacheserver_url}/{request.url}',
method='PUT' if state.recache else 'GET'
)
return None
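
A minimal sketch of the rewrite the middleware performs, with example values for the cache server URL and the original request URL (neither comes from the repository configuration):

```python
# Example values (not from the repository configuration)
cacheserver_url = 'http://cacheserver:8080'
recache = True
request_url = 'https://example.com/category/page-1'

# The request is routed through the cache server; PUT forces a recache, GET serves from cache
rewritten_url = f'{cacheserver_url}/{request_url}'
method = 'PUT' if recache else 'GET'
# -> PUT http://cacheserver:8080/https://example.com/category/page-1
```
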
2 changes: 1 addition & 1 deletion seosnap_cachewarmer/pipelines.py
@@ -15,7 +15,7 @@ def from_crawler(cls, crawler):
return pipeline

def spider_opened(self, spider: SeosnapSpider):
self.exporter = SeosnapItemExporter(spider.website_id)
self.exporter = SeosnapItemExporter(spider.state.website_id, spider.state.use_queue)
self.exporter.start_exporting()

def spider_closed(self, spider):
21 changes: 19 additions & 2 deletions seosnap_cachewarmer/service.py
@@ -17,12 +17,29 @@ def __init__(self) -> None:
))
self.schema = self.client.get(os.getenv('API_URL'))

def get_website(self, website_id: int):
def get_website(self, website_id: int) -> dict:
action = ["api", "websites", "read"]
params = {"version": os.getenv('API_VER'), "id": website_id}
return self.client.action(self.schema, action, params=params)

def update_pages(self, website_id: int, pages: List[dict]):
def update_pages(self, website_id: int, pages: List[dict]) -> List[dict]:
action = ["api", "websites", "pages", "update_pages"]
params = {"version": os.getenv('API_VER'), "website_id": website_id, "items": pages}
return self.client.action(self.schema, action, params=params)

def get_queue(self, website_id: int) -> dict:
action = ["api", "websites", "queue_0"]
params = {"version": os.getenv('API_VER'), "website_id": website_id}
return self.client.action(self.schema, action, params=params)

def update_queue(self, website_id: int, queue_items: List[dict]) -> List[dict]:
action = ['api', 'websites', 'queue', 'update_queue']
params = {"version": os.getenv('API_VER'), "website_id": website_id, "items": queue_items}
return self.client.action(self.schema, action, params=params)

def clean_queue(self, website_id: int) -> List[dict]:
action = ['api', 'websites', 'queue', 'clean_queue']
params = {"version": os.getenv('API_VER'), "website_id": website_id}
return self.client.action(self.schema, action, params=params)
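
A hedged usage sketch of the new queue endpoints (assumes `API_URL` and `API_VER` are configured in `.env` as elsewhere in this repo; website id 1 and the page address are example values):

```python
# Hypothetical calls against the service layer defined above
from seosnap_cachewarmer.service import SeosnapService

service = SeosnapService()

queued = service.get_queue(1)  # pages currently waiting in the queue for website 1
service.update_queue(1, [{'page': {'address': '/'}, 'status': 'completed'}])
service.clean_queue(1)         # empty the queue for website 1
```
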


45 changes: 15 additions & 30 deletions seosnap_cachewarmer/spider.py
@@ -1,3 +1,4 @@
import itertools
import os
import urllib.parse as urllib
from typing import Dict, List
@@ -6,56 +7,40 @@
from scrapy.http import Response
from scrapy.spiders import SitemapSpider

from seosnap_cachewarmer.service import SeosnapService
from seosnap_cachewarmer.state import SeosnapState


class SeosnapSpider(SitemapSpider):
website: dict
website_id: int
follow_next: bool
recache: bool

cacheserver_url: str
service: SeosnapService
extract_fields: Dict[str, str]
other_pages: List[str]

state: SeosnapState
name = 'Seosnap'

def __init__(self, website_id, follow_next=True, recache=True) -> None:
self.service = SeosnapService()
self.website_id = website_id
self.follow_next = recache not in ['false', '0']
self.recache = recache not in ['false', '0']
self.cacheserver_url = os.getenv('CACHEWARMER_CACHE_SERVER_URL').rstrip('/')
self.website = self.service.get_website(self.website_id)

self.name = f'Cachewarm: {self.website["name"]}'
self.other_pages = [self.website["domain"]]
self.extract_fields = {field['name']: field["css_selector"] for field in self.website["extract_fields"]}
sitemap_urls = [self.website["sitemap"]]
super().__init__(sitemap_urls=sitemap_urls)
def __init__(self, *args, **kwargs) -> None:
self.state = SeosnapState(*args, **kwargs)
self.name = self.state.get_name()
super().__init__(sitemap_urls=self.state.sitemap_urls())

def start_requests(self):
requests = [Request(url, self.parse) for url in self.other_pages]
requests += list(super().start_requests())
return requests
if self.state.load:
return (Request(url, self.parse) for url in self.state.get_load_urls())
else:
extra_urls = (Request(url, self.parse) for url in self.state.extra_pages())
return itertools.chain(extra_urls, super().start_requests())

def parse(self, response: Response):
data = {
name: response.css(selector).extract_first()
for name, selector in self.extract_fields.items()
for name, selector in self.state.extract_fields.items()
}

# Follow next links
if self.follow_next:
if self.state.follow_next:
rel_next_url = response.css('link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()
if rel_next_url is not None:
data['rel_next_url'] = rel_next_url
yield response.follow(rel_next_url, callback=self.parse)

# Strip cacheserver from the url if possible
url = response.url[len(self.cacheserver_url):].lstrip('/')
url = response.url[len(self.state.cacheserver_url):].lstrip('/')
url = urllib.urlparse(url)
url = urllib.urlunparse(('', '', url.path, url.params, url.query, ''))

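
To illustrate the URL normalisation at the end of `parse` (the cache server and page URLs are example values, not taken from a real crawl):

```python
import urllib.parse as urllib

# Example values, mirroring the stripping logic in the diff above
cacheserver_url = 'http://cacheserver:8080'
response_url = 'http://cacheserver:8080/https://example.com/blog?page=2'

# Strip the cache server prefix, then drop scheme and host from what remains
url = response_url[len(cacheserver_url):].lstrip('/')
url = urllib.urlparse(url)
url = urllib.urlunparse(('', '', url.path, url.params, url.query, ''))
# url == '/blog?page=2'  (path and query kept, scheme and host removed)
```
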
71 changes: 71 additions & 0 deletions seosnap_cachewarmer/state.py
@@ -0,0 +1,71 @@
import os
import requests
from typing import Dict, Union, Iterable
import urllib.parse as urllib

from seosnap_cachewarmer.service import SeosnapService


class SeosnapState:
website: dict
website_id: int
follow_next: bool
recache: bool
use_queue: bool
load: bool

cacheserver_url: str
service: SeosnapService
extract_fields: Dict[str, str]

def __init__(self, website_id, follow_next=True, recache=True, use_queue=False, load=False) -> None:
self.service = SeosnapService()
self.website_id = website_id
self.use_queue = parse_bool(use_queue)
self.load = parse_bool(load)
self.follow_next = parse_bool(follow_next) and not self.use_queue and not self.load
self.recache = parse_bool(recache) and not self.load

self.cacheserver_url = os.getenv('CACHEWARMER_CACHE_SERVER_URL').rstrip('/')
self.website = self.service.get_website(self.website_id)
self.extract_fields = {field['name']: field["css_selector"] for field in self.website["extract_fields"]}

def get_name(self) -> str:
return f'Cachewarm: {self.website["name"]}'

def sitemap_urls(self) -> Iterable[str]:
if not self.use_queue:
yield self.website["sitemap"]

def extra_pages(self) -> Iterable[str]:
if not self.use_queue:
yield self.website["domain"]
else:
for url in self.get_queue(): yield url

def get_queue(self) -> Iterable[str]:
# Retrieve queue items while queue is not empty
uri = urllib.urlparse(self.website['domain'])
root_domain = f'{uri.scheme}://{uri.netloc}'
while True:
items = self.service.get_queue(self.website_id)
# Empty queue
if len(items) == 0: break

for item in items:
path = item['page']['address']
yield f'{root_domain}{path}'

def get_load_urls(self) -> Iterable[str]:
uri = urllib.urlparse(self.website['domain'])
root_domain = f'{uri.scheme}://{uri.netloc}'
response = requests.get(urllib.urljoin(self.cacheserver_url, f'/list/{root_domain}'))
        if response.status_code // 200 != 1: return

urls = response.text.splitlines()
for url in urls: yield url


def parse_bool(s: Union[str, bool]) -> bool:
if isinstance(s, bool): return s
return s not in ['false', 'False', '0']
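
Spider arguments arrive as strings (e.g. from `scrapy crawl Seosnap -a use_queue=true`), so `parse_bool` above normalises them; a few illustrative results:

```python
from seosnap_cachewarmer.state import parse_bool

parse_bool(True)      # True  (already a bool, returned unchanged)
parse_bool('false')   # False
parse_bool('0')       # False
parse_bool('true')    # True  (any string outside ['false', 'False', '0'] counts as true)
```
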
