Merge pull request #2 from experius/release/0.2.0
Release/0.2.0
egordm authored Feb 25, 2020
2 parents f1cdf6e + 16393a3 commit f2d38da
Showing 9 changed files with 217 additions and 44 deletions.
37 changes: 36 additions & 1 deletion README.md
@@ -5,4 +5,39 @@ cp .env.example .env
# Edit default settings
docker-compose up -d --build
```

# Commands
### Cache
Handles caching of the pages associated with the given websites. `WEBSITE_IDS` is a comma-separated list of website ids, e.g. `1,2`.
```
Usage: crawl.py cache [OPTIONS] WEBSITE_IDS

Options:
  --follow_next BOOLEAN  Follow rel-next links if enabled
  --recache BOOLEAN      Recache all pages instead of only pages not yet cached
  --use_queue BOOLEAN    Cache URLs from the queue instead of the sitemap
  --load BOOLEAN         Whether URLs already loaded in the cache server should be scraped instead
  --help                 Show this message and exit.
```

### Clean
Handles cleaning of the dashboard queue
```
Usage: crawl.py clean [OPTIONS] WEBSITE_IDS

Options:
  --help  Show this message and exit.
```

# Examples
```
# Cache the sitemap of website 1
docker-compose run cachewarmer cache 1
# Cache requests in the queue for websites 1 and 2
docker-compose run cachewarmer cache 1,2 --use_queue=true
# Clean the queue for websites 1 and 2
docker-compose run cachewarmer clean 1,2
```
57 changes: 52 additions & 5 deletions crawl.py
@@ -1,11 +1,58 @@
#!/usr/bin/python
import os, sys
import os, sys, logging

from seosnap_cachewarmer import logging
from scrapy.cmdline import execute
import click
from functools import reduce
from dotenv import load_dotenv
from scrapy.cmdline import execute
from scrapy.utils.log import configure_logging

from seosnap_cachewarmer.service import SeosnapService

load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '.env'))
configure_logging(install_root_handler=False)
logging.basicConfig(
filename=os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs/cachewarmer.log'),
level=os.getenv('CACHEWARMER_LOG_LEVEL')
)


@click.group()
def cli():
pass


@cli.command()
@click.argument('website_ids')
def load(website_ids: str, **args):
for website_id in website_ids.split(','):
click.echo(f'Loading website: {website_id}')
        arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)


@cli.command()
@click.argument('website_ids')
@click.option('--follow_next', type=bool, default=True, help='Follow rel-next links if enabled')
@click.option('--recache', type=bool, default=True, help='Recache all pages instead of only pages not yet cached')
@click.option('--use_queue', type=bool, default=False, help='Cache URLs from the queue instead of the sitemap')
@click.option('--load', type=bool, default=False, help='Whether URLs already loaded in the cache server should be scraped instead')
def cache(website_ids: str, **args):
for website_id in website_ids.split(','):
click.echo(f'Caching website: {website_id}')
arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()])
execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)


@cli.command()
@click.argument('website_ids')
def clean(website_ids: str):
service = SeosnapService()
for website_id in website_ids.split(','):
service.clean_queue(int(website_id))
logging.info(f'Cleaned the queue for website: {website_id}')
click.echo(f'Cleaned the queue for websites: {website_ids}')


if len(sys.argv) < 2: raise Exception('Missing website_id as argument')
execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={sys.argv[1]}'] + sys.argv[2:])
if __name__ == '__main__':
cli()
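
For reference, a minimal sketch of how the click options above are flattened into Scrapy spider arguments by the `cache` and `load` commands (the option values here are example defaults, not taken from a real run):

```python
from functools import reduce

# Example option values as collected by click (assumed for illustration)
args = {'follow_next': True, 'recache': True, 'use_queue': False, 'load': False}

# Each key/value pair becomes a ['-a', 'key=value'] token pair for `scrapy crawl`
arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
print(arg_tokens)
# ['-a', 'follow_next=True', '-a', 'recache=True', '-a', 'use_queue=False', '-a', 'load=False']
```
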
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
scrapy
coreapi==2.3.3
python-dotenv==0.10.3
click==7.0
17 changes: 16 additions & 1 deletion seosnap_cachewarmer/exporter.py
@@ -9,11 +9,13 @@
class SeosnapItemExporter(BaseItemExporter):
website_id: int
service: SeosnapService
use_queue: bool
buffer: List[dict]
buffer_size: int = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 50))

def __init__(self, website_id, buffer_size=None, **kwargs):
def __init__(self, website_id, use_queue=False, buffer_size=None, **kwargs):
super().__init__(**kwargs)
self.use_queue = use_queue
if buffer_size: self.buffer_size = buffer_size
self.website_id = website_id

@@ -31,5 +33,18 @@ def export_item(self, item):

def flush(self):
if len(self.buffer) > 0:
if self.use_queue: self.flush_queue()
self.service.update_pages(self.website_id, self.buffer)
self.buffer = []

def flush_queue(self):
items = [
{
'page': {
'address': item['address']
},
'status': 'completed' if item['status_code'] // 200 == 1 else 'failed'
}
for item in self.buffer
]
self.service.update_queue(self.website_id, items)
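
As a rough illustration of the payload `flush_queue` builds, here is how one buffered page item maps onto a queue entry (the item values are made up for this sketch):

```python
# One buffered page item (hypothetical values)
item = {'address': '/category/shoes', 'status_code': 200}

# Mapped onto a queue update entry, mirroring flush_queue above
queue_entry = {
    'page': {'address': item['address']},
    'status': 'completed' if item['status_code'] // 200 == 1 else 'failed',
}
# -> {'page': {'address': '/category/shoes'}, 'status': 'completed'}
```
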
10 changes: 6 additions & 4 deletions seosnap_cachewarmer/middleware.py
@@ -7,13 +7,15 @@

class CacheServerMiddleware(object):
def process_request(self, request: Request, spider: SeosnapSpider):
state = spider.state

if CACHE_REQUEST_FLAG not in request.meta \
and request.url not in spider.sitemap_urls \
and request.url not in state.sitemap_urls() \
and not request.url.endswith('sitemap.xml') \
and spider.cacheserver_url:
and state.cacheserver_url:
request.meta[CACHE_REQUEST_FLAG] = True
return request.replace(
url=f'{spider.cacheserver_url}/{request.url}',
method='PUT' if spider.recache else 'GET'
url=f'{state.cacheserver_url}/{request.url}',
method='PUT' if state.recache else 'GET'
)
return None
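
A minimal sketch of the rewrite the middleware performs, with example values for the cache server URL and the original request URL (neither comes from the repository configuration):

```python
# Example values (not from the repository configuration)
cacheserver_url = 'http://cacheserver:8080'
recache = True
request_url = 'https://example.com/category/page-1'

# The request is routed through the cache server; PUT forces a recache, GET serves from cache
rewritten_url = f'{cacheserver_url}/{request_url}'
method = 'PUT' if recache else 'GET'
# -> PUT http://cacheserver:8080/https://example.com/category/page-1
```
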
2 changes: 1 addition & 1 deletion seosnap_cachewarmer/pipelines.py
@@ -15,7 +15,7 @@ def from_crawler(cls, crawler):
return pipeline

def spider_opened(self, spider: SeosnapSpider):
self.exporter = SeosnapItemExporter(spider.website_id)
self.exporter = SeosnapItemExporter(spider.state.website_id, spider.state.use_queue)
self.exporter.start_exporting()

def spider_closed(self, spider):
21 changes: 19 additions & 2 deletions seosnap_cachewarmer/service.py
@@ -17,12 +17,29 @@ def __init__(self) -> None:
))
self.schema = self.client.get(os.getenv('API_URL'))

def get_website(self, website_id: int):
def get_website(self, website_id: int) -> dict:
action = ["api", "websites", "read"]
params = {"version": os.getenv('API_VER'), "id": website_id}
return self.client.action(self.schema, action, params=params)

def update_pages(self, website_id: int, pages: List[dict]):
def update_pages(self, website_id: int, pages: List[dict]) -> List[dict]:
action = ["api", "websites", "pages", "update_pages"]
params = {"version": os.getenv('API_VER'), "website_id": website_id, "items": pages}
return self.client.action(self.schema, action, params=params)

def get_queue(self, website_id: int) -> dict:
action = ["api", "websites", "queue_0"]
params = {"version": os.getenv('API_VER'), "website_id": website_id}
return self.client.action(self.schema, action, params=params)

def update_queue(self, website_id: int, queue_items: List[dict]) -> List[dict]:
action = ['api', 'websites', 'queue', 'update_queue']
params = {"version": os.getenv('API_VER'), "website_id": website_id, "items": queue_items}
return self.client.action(self.schema, action, params=params)

def clean_queue(self, website_id: int) -> List[dict]:
action = ['api', 'websites', 'queue', 'clean_queue']
params = {"version": os.getenv('API_VER'), "website_id": website_id}
return self.client.action(self.schema, action, params=params)
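
A hedged usage sketch of the new queue endpoints (assumes `API_URL` and `API_VER` are configured in `.env` as elsewhere in this repo; website id 1 and the page address are example values):

```python
# Hypothetical calls against the service layer defined above
from seosnap_cachewarmer.service import SeosnapService

service = SeosnapService()

queued = service.get_queue(1)  # pages currently waiting in the queue for website 1
service.update_queue(1, [{'page': {'address': '/'}, 'status': 'completed'}])
service.clean_queue(1)         # empty the queue for website 1
```
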


45 changes: 15 additions & 30 deletions seosnap_cachewarmer/spider.py
@@ -1,3 +1,4 @@
import itertools
import os
import urllib.parse as urllib
from typing import Dict, List
@@ -6,56 +7,40 @@
from scrapy.http import Response
from scrapy.spiders import SitemapSpider

from seosnap_cachewarmer.service import SeosnapService
from seosnap_cachewarmer.state import SeosnapState


class SeosnapSpider(SitemapSpider):
website: dict
website_id: int
follow_next: bool
recache: bool

cacheserver_url: str
service: SeosnapService
extract_fields: Dict[str, str]
other_pages: List[str]

state: SeosnapState
name = 'Seosnap'

def __init__(self, website_id, follow_next=True, recache=True) -> None:
self.service = SeosnapService()
self.website_id = website_id
self.follow_next = recache not in ['false', '0']
self.recache = recache not in ['false', '0']
self.cacheserver_url = os.getenv('CACHEWARMER_CACHE_SERVER_URL').rstrip('/')
self.website = self.service.get_website(self.website_id)

self.name = f'Cachewarm: {self.website["name"]}'
self.other_pages = [self.website["domain"]]
self.extract_fields = {field['name']: field["css_selector"] for field in self.website["extract_fields"]}
sitemap_urls = [self.website["sitemap"]]
super().__init__(sitemap_urls=sitemap_urls)
def __init__(self, *args, **kwargs) -> None:
self.state = SeosnapState(*args, **kwargs)
self.name = self.state.get_name()
super().__init__(sitemap_urls=self.state.sitemap_urls())

def start_requests(self):
requests = [Request(url, self.parse) for url in self.other_pages]
requests += list(super().start_requests())
return requests
if self.state.load:
return (Request(url, self.parse) for url in self.state.get_load_urls())
else:
extra_urls = (Request(url, self.parse) for url in self.state.extra_pages())
return itertools.chain(extra_urls, super().start_requests())

def parse(self, response: Response):
data = {
name: response.css(selector).extract_first()
for name, selector in self.extract_fields.items()
for name, selector in self.state.extract_fields.items()
}

# Follow next links
if self.follow_next:
if self.state.follow_next:
rel_next_url = response.css('link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()
if rel_next_url is not None:
data['rel_next_url'] = rel_next_url
yield response.follow(rel_next_url, callback=self.parse)

# Strip cacheserver from the url if possible
url = response.url[len(self.cacheserver_url):].lstrip('/')
url = response.url[len(self.state.cacheserver_url):].lstrip('/')
url = urllib.urlparse(url)
url = urllib.urlunparse(('', '', url.path, url.params, url.query, ''))

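
To illustrate the URL normalisation at the end of `parse` (the cache server and page URLs are example values, not taken from a real crawl):

```python
import urllib.parse as urllib

# Example values, mirroring the stripping logic in the diff above
cacheserver_url = 'http://cacheserver:8080'
response_url = 'http://cacheserver:8080/https://example.com/blog?page=2'

# Strip the cache server prefix, then drop scheme and host from what remains
url = response_url[len(cacheserver_url):].lstrip('/')
url = urllib.urlparse(url)
url = urllib.urlunparse(('', '', url.path, url.params, url.query, ''))
# url == '/blog?page=2'  (path and query kept, scheme and host removed)
```
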
71 changes: 71 additions & 0 deletions seosnap_cachewarmer/state.py
@@ -0,0 +1,71 @@
import os
import requests
from typing import Dict, Union, Iterable
import urllib.parse as urllib

from seosnap_cachewarmer.service import SeosnapService


class SeosnapState:
website: dict
website_id: int
follow_next: bool
recache: bool
use_queue: bool
load: bool

cacheserver_url: str
service: SeosnapService
extract_fields: Dict[str, str]

def __init__(self, website_id, follow_next=True, recache=True, use_queue=False, load=False) -> None:
self.service = SeosnapService()
self.website_id = website_id
self.use_queue = parse_bool(use_queue)
self.load = parse_bool(load)
self.follow_next = parse_bool(follow_next) and not self.use_queue and not self.load
self.recache = parse_bool(recache) and not self.load

self.cacheserver_url = os.getenv('CACHEWARMER_CACHE_SERVER_URL').rstrip('/')
self.website = self.service.get_website(self.website_id)
self.extract_fields = {field['name']: field["css_selector"] for field in self.website["extract_fields"]}

def get_name(self) -> str:
return f'Cachewarm: {self.website["name"]}'

def sitemap_urls(self) -> Iterable[str]:
if not self.use_queue:
yield self.website["sitemap"]

def extra_pages(self) -> Iterable[str]:
if not self.use_queue:
yield self.website["domain"]
else:
for url in self.get_queue(): yield url

def get_queue(self) -> Iterable[str]:
# Retrieve queue items while queue is not empty
uri = urllib.urlparse(self.website['domain'])
root_domain = f'{uri.scheme}://{uri.netloc}'
while True:
items = self.service.get_queue(self.website_id)
# Empty queue
if len(items) == 0: break

for item in items:
path = item['page']['address']
yield f'{root_domain}{path}'

def get_load_urls(self) -> Iterable[str]:
uri = urllib.urlparse(self.website['domain'])
root_domain = f'{uri.scheme}://{uri.netloc}'
response = requests.get(urllib.urljoin(self.cacheserver_url, f'/list/{root_domain}'))
        if response.status_code // 200 != 1: return

urls = response.text.splitlines()
for url in urls: yield url


def parse_bool(s: Union[str, bool]) -> bool:
if isinstance(s, bool): return s
return s not in ['false', 'False', '0']
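
Spider arguments arrive as strings (e.g. from `scrapy crawl Seosnap -a use_queue=true`), so `parse_bool` above normalises them; a few illustrative results:

```python
from seosnap_cachewarmer.state import parse_bool

parse_bool(True)      # True  (already a bool, returned unchanged)
parse_bool('false')   # False
parse_bool('0')       # False
parse_bool('true')    # True  (any string outside ['false', 'False', '0'] counts as true)
```
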
