Merge pull request #2 from experius/release/0.2.0
Release/0.2.0
Showing 9 changed files with 217 additions and 44 deletions.
@@ -1,11 +1,58 @@
 #!/usr/bin/python
-import os, sys
+import os, sys, logging

-from seosnap_cachewarmer import logging
-from scrapy.cmdline import execute
+import click
+from functools import reduce
+from dotenv import load_dotenv
+from scrapy.cmdline import execute
+from scrapy.utils.log import configure_logging
+
+from seosnap_cachewarmer.service import SeosnapService
+
+load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '.env'))
+configure_logging(install_root_handler=False)
+logging.basicConfig(
+    filename=os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs/cachewarmer.log'),
+    level=os.getenv('CACHEWARMER_LOG_LEVEL')
+)
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.command()
+@click.argument('website_ids')
+def load(website_ids: str, **args):
+    for website_id in website_ids.split(','):
+        click.echo(f'Loading website: {website_id}')
+        # Flatten extra options into scrapy's repeatable "-a key=value" tokens.
+        # The [] initial value keeps reduce from raising TypeError when there are no options.
+        arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
+        execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)
+
+
+@cli.command()
+@click.argument('website_ids')
+@click.option('--follow_next', type=bool, default=True, help='Follow rel-next links if enabled')
+@click.option('--recache', type=bool, default=True, help='Recache all pages instead of only not-yet-cached ones')
+@click.option('--use_queue', type=bool, default=False, help='Cache urls from the queue instead of the sitemap')
+@click.option('--load', type=bool, default=False, help='Whether already loaded urls should be scraped instead')
+def cache(website_ids: str, **args):
+    for website_id in website_ids.split(','):
+        click.echo(f'Caching website: {website_id}')
+        arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
+        execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)
+
+
+@cli.command()
+@click.argument('website_ids')
+def clean(website_ids: str):
+    service = SeosnapService()
+    for website_id in website_ids.split(','):
+        service.clean_queue(int(website_id))
+        logging.info(f'Cleaned the queue for website: {website_id}')
+    click.echo(f'Cleaned the queue for websites: {website_ids}')
+
-if len(sys.argv) < 2: raise Exception('Missing website_id as argument')
-execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={sys.argv[1]}'] + sys.argv[2:])
+
+if __name__ == '__main__':
+    cli()
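The two crawl commands flatten their click options into scrapy's repeatable "-a key=value" arguments with reduce. A minimal standalone sketch of that flattening, using a made-up options dict (note that without the [] initial value, reduce raises TypeError on an empty dict, which is exactly the no-options case of the load command):

from functools import reduce

args = {'use_queue': 'true', 'recache': 'false'}  # hypothetical CLI options
arg_tokens = reduce(lambda x, y: x + y, [['-a', f'{k}={v}'] for k, v in args.items()], [])
print(arg_tokens)  # ['-a', 'use_queue=true', '-a', 'recache=false']

One caveat: scrapy.cmdline.execute normally terminates the process once the crawl finishes, so a comma-separated list of website ids will in practice only crawl the first id per invocation.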
@@ -1,3 +1,4 @@
 scrapy
 coreapi==2.3.3
 python-dotenv==0.10.3
+click==7.0
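The single addition here is click==7.0, which backs the new command group in the entry script; python-dotenv loads the .env file and coreapi presumably backs the SeosnapService client. A quick, illustrative sanity check that the pinned packages import under their module names:

import click, coreapi, dotenv, scrapy  # pins: click==7.0, coreapi==2.3.3, python-dotenv==0.10.3
print(click.__version__)  # expected: 7.0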
@@ -0,0 +1,71 @@
+import os
+import requests
+from typing import Dict, Union, Iterable
+import urllib.parse as urllib
+
+from seosnap_cachewarmer.service import SeosnapService
+
+
+class SeosnapState:
+    website: dict
+    website_id: int
+    follow_next: bool
+    recache: bool
+    use_queue: bool
+    load: bool
+
+    cacheserver_url: str
+    service: SeosnapService
+    extract_fields: Dict[str, str]
+
+    def __init__(self, website_id, follow_next=True, recache=True, use_queue=False, load=False) -> None:
+        self.service = SeosnapService()
+        self.website_id = website_id
+        self.use_queue = parse_bool(use_queue)
+        self.load = parse_bool(load)
+        # Queue and load modes take precedence: both disable rel-next following,
+        # and load mode additionally disables recaching.
+        self.follow_next = parse_bool(follow_next) and not self.use_queue and not self.load
+        self.recache = parse_bool(recache) and not self.load
+
+        self.cacheserver_url = os.getenv('CACHEWARMER_CACHE_SERVER_URL').rstrip('/')
+        self.website = self.service.get_website(self.website_id)
+        self.extract_fields = {field['name']: field['css_selector'] for field in self.website['extract_fields']}
+
+    def get_name(self) -> str:
+        return f'Cachewarm: {self.website["name"]}'
+
+    def sitemap_urls(self) -> Iterable[str]:
+        # In queue mode the sitemap is skipped entirely.
+        if not self.use_queue:
+            yield self.website['sitemap']
+
+    def extra_pages(self) -> Iterable[str]:
+        if not self.use_queue:
+            yield self.website['domain']
+        else:
+            for url in self.get_queue(): yield url
+
+    def get_queue(self) -> Iterable[str]:
+        # Retrieve queue items while the queue is not empty
+        uri = urllib.urlparse(self.website['domain'])
+        root_domain = f'{uri.scheme}://{uri.netloc}'
+        while True:
+            items = self.service.get_queue(self.website_id)
+            # Empty queue
+            if len(items) == 0: break
+
+            for item in items:
+                path = item['page']['address']
+                yield f'{root_domain}{path}'
+
+    def get_load_urls(self) -> Iterable[str]:
+        uri = urllib.urlparse(self.website['domain'])
+        root_domain = f'{uri.scheme}://{uri.netloc}'
+        response = requests.get(urllib.urljoin(self.cacheserver_url, f'/list/{root_domain}'))
+        # Stop on a non-2xx/3xx response; the original bare "yield" here would
+        # emit a spurious None and then fall through to the url list anyway.
+        if response.status_code // 200 != 1: return
+
+        urls = response.text.splitlines()
+        for url in urls: yield url
+
+
+def parse_bool(s: Union[str, bool]) -> bool:
+    # Env/CLI values arrive as strings; only an explicit 'false', 'False' or '0' counts as False.
+    if isinstance(s, bool): return s
+    return s not in ['false', 'False', '0']
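Two behaviours of this new state module are easy to miss. First, the constructor's flag precedence: queue mode and load mode both disable rel-next following, and load mode also disables recaching. Replayed with plain booleans (values picked arbitrarily for illustration):

follow_next, recache, use_queue, load = True, True, True, False  # e.g. cache --use_queue=true
follow_next = follow_next and not use_queue and not load  # queue/load modes never follow rel-next
recache = recache and not load                            # load=False, so recache stays on
print(follow_next, recache)  # False True

Second, parse_bool treats every string except the literals 'false', 'False' and '0' as True, so a value like 'no' still enables a flag. A few illustrative checks against the function as defined above:

assert parse_bool(True) is True
assert parse_bool('false') is False
assert parse_bool('0') is False
assert parse_bool('no') is True  # any other string counts as True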