Merge pull request #8 from experius/feature/PWAI-460
Feature/pwai 460
lewisvoncken authored Jun 13, 2022
2 parents 386f777 + 780db90 commit 720af7e
Showing 9 changed files with 213 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -3,7 +3,7 @@ API_VER=v1
API_NAME=snaptron
API_PASS=snaptron

-CACHEWARMER_CACHE_SERVER_URL=http://rendertron:3000/render
+CACHEWARMER_CACHE_SERVER_URL=http://rendertron:3000/seo-snap
CACHEWARMER_USER_AGENT=Seosnap
CACHEWARMER_THREADS=2
CACHEWARMER_CONCURRENT_REQUESTS=16
48 changes: 48 additions & 0 deletions crawl.py
@@ -1,6 +1,7 @@
#!/usr/bin/python
import os, sys, logging

import requests
import click
from functools import reduce
from dotenv import load_dotenv
@@ -11,6 +12,10 @@

from seosnap_cachewarmer.service import SeosnapService
from seosnap_cachewarmer.spider import SeosnapSpider
# from seosnap_cachewarmer.tagSpider import SeosnapTagSpider
from seosnap_cachewarmer.state import SeosnapState
from seosnap_cachewarmer.sitemap import SeoSnapSitemapRefresher
import xml.etree.ElementTree as ET

load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '.env'))
configure_logging(install_root_handler=False)
@@ -34,6 +39,31 @@ def load(website_ids: str, **args):
        execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)


# @cli.command()
# @click.argument('website_ids')
# @click.argument('tag')
# def tag(website_ids: str, tag: str, **args):
# print("-- start recacheByTag --")
# print("test1 ckasdhlasd")
# print(website_ids)
# print(tag)
#
# try:
# settings = get_project_settings()
# process = CrawlerProcess(settings)
# for website_id in website_ids.split(','):
# process.crawl(
# SeosnapTagSpider,
# website_id=website_id,
# tag=tag,
# **args
# )
# process.start()
# except Exception as e:
# click.echo(str(e), err=True)



@cli.command()
@click.argument('website_ids')
@click.option('--follow_next', type=bool, default=True, help='Follows rel-next links if enabled')
@@ -56,6 +86,24 @@ def cache(website_ids: str, **args):
        click.echo(str(e), err=True)


@cli.command()
@click.argument('website_ids')
def sync(website_ids: str, *args, **kwargs):
    print('Start sync')
    for website_id in website_ids.split(','):
        service = SeosnapService()
        service.sync_pages(website_id)


@cli.command()
@click.argument('website_ids')
def redooldqueue(website_ids: str, *args, **kwargs):
    print('Start redo')
    for website_id in website_ids.split(','):
        service = SeosnapService()
        service.queue_old_redo(website_id)


@cli.command()
@click.argument('website_ids')
def clean(website_ids: str):
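
The two new commands can be exercised without going through the shell, for instance with click's test runner; a minimal sketch, assuming crawl.py is importable as a module named crawl and that the decorators above hang off a click group called cli (the website IDs are made up):

# Minimal sketch: invoke the new sync and redooldqueue commands in-process.
# Assumptions: crawl.py is importable as `crawl`, the click group is named `cli`,
# and websites 1 and 2 are hypothetical dashboard IDs.
from click.testing import CliRunner

from crawl import cli

runner = CliRunner()

result = runner.invoke(cli, ["sync", "1,2"])        # sync pages for websites 1 and 2
print(result.exit_code, result.output)

result = runner.invoke(cli, ["redooldqueue", "1"])  # re-queue old pages for website 1
print(result.exit_code, result.output)
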
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -3,7 +3,7 @@ services:
  cachewarmer:
    build:
      context: .
-    image: experiusnl/seosnap-cachewarmer:${TAG}
+    image: experiusnl/seosnap-cachewarmer
    container_name: seosnap_cachewarmer
    restart: "no"
    environment:
2 changes: 1 addition & 1 deletion seosnap_cachewarmer/exporter.py
@@ -11,7 +11,7 @@ class SeosnapItemExporter(BaseItemExporter):
    service: SeosnapService
    use_queue: bool
    buffer: List[dict]
-    buffer_size: int = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 50))
+    buffer_size: int = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 2))

    def __init__(self, website_id, use_queue=False, buffer_size=None, **kwargs):
        super().__init__(**kwargs)
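
The default flush size drops from 50 to 2, but the environment variable still takes precedence; a minimal sketch of that fallback, assuming nothing beyond the os.getenv expression shown above (the override value is illustrative):

import os

# The env var wins; only when it is absent does the new default of 2 apply.
os.environ["CACHEWARMER_BUFFER_SIZE"] = "50"   # illustrative override, e.g. from .env
buffer_size = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 2))
print(buffer_size)  # 50 with the override, 2 without it
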
44 changes: 42 additions & 2 deletions seosnap_cachewarmer/middleware.py
@@ -14,18 +14,58 @@ class CacheServerMiddleware(object):
    def process_request(self, request: Request, spider: SeosnapSpider):
        state = spider.state

        parsed_url = urllib.urlparse(request.url)

        print(" process_request ???")
        print(" 1 < -")
        print(parsed_url.path)
        print(" 1.5 < -")
        print(parsed_url.geturl())
        print(" 1.7 < -")
        print(parsed_url.hostname)
        print(" 1.8 < -")
        print(parsed_url.path)
        print(" 1.9 < -")
        print(parsed_url.netloc)
        print(" 1.95 < -")
        print(parsed_url.scheme)
        print(" 2 < -")
        print(parsed_url.query)
        print(" 3 < -")
        print(urllib.parse_qs(parsed_url.query))
        print(" 4 < -")

        if CACHE_REQUEST_FLAG not in request.meta \
                and request.url not in state.sitemap_urls() \
                and not request.url.endswith('.xml') \
                and state.cacheserver_url:
            request.meta[CACHE_REQUEST_FLAG] = True

            # Quote the request params as required by rendertron
-            url = urllib.quote(request.url, safe='/:')
+            url = request.url

            # Add mobile param if we are rendering mobile pages
            params = {}
            if state.recache: params['refreshCache'] = 'true'
            if state.mobile: params['mobile'] = 1

            print("test")
            if 'mobile' in urllib.parse_qs(parsed_url.query) and urllib.parse_qs(parsed_url.query)['mobile']:
                print("test2")
                params['mobile'] = 1
                print(url)
                new_url = urllib.urlparse(url)
                query = urllib.parse_qs(new_url.query)
                print(query)
                query.pop('mobile', None)
                new_url = new_url._replace(query=urllib.urlencode(query, True))
                print(new_url)
                url = new_url.geturl()
                print("xasdaskdsa")
                print(url)

            print("test3")

            url = urllib.quote(url, safe='/:')

            return request.replace(
                url=f'{state.cacheserver_url}/{url}?{urllib.urlencode(params)}',
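
In effect the middleware now forwards every crawl request to the cache server, carrying a mobile=1 flag over from the original query string (and stripping it from the forwarded URL) and adding refreshCache when recaching; a standalone sketch of that rewriting step, based only on the hunk above, with a made-up helper name and example URLs:

import urllib.parse as urllib

def rewrite_for_cacheserver(url: str, cacheserver_url: str, recache: bool = False) -> str:
    # Mirror the middleware: move a mobile=1 query flag onto the render request,
    # drop it from the origin URL, then quote the URL and prepend the cache server.
    parsed = urllib.urlparse(url)
    query = urllib.parse_qs(parsed.query)

    params = {}
    if recache:
        params['refreshCache'] = 'true'

    if query.get('mobile'):
        params['mobile'] = 1
        query.pop('mobile', None)
        parsed = parsed._replace(query=urllib.urlencode(query, True))
        url = parsed.geturl()

    quoted = urllib.quote(url, safe='/:')
    return f'{cacheserver_url}/{quoted}?{urllib.urlencode(params)}'

# Example with hypothetical URLs:
print(rewrite_for_cacheserver(
    'https://example.com/category?page=2&mobile=1',
    'http://rendertron:3000/seo-snap',
    recache=True,
))
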
9 changes: 9 additions & 0 deletions seosnap_cachewarmer/service.py
@@ -53,5 +53,14 @@ def report_errors(self, website_id: int, errors: List[dict]) -> List[dict]:
        params = {"version": os.getenv('API_VER'), "website_id": website_id, "errors": errors}
        return self.client.action(self.schema, action, params=params)

    def sync_pages(self, website_id: int):
        action = ['api', 'websites', 'pages', 'sync']
        params = {"version": os.getenv('API_VER'), "website_id": website_id}
        return self.client.action(self.schema, action, params=params)

    def queue_old_redo(self, website_id: int):
        action = ['api', 'websites', 'queue', 'redo', 'old']
        params = {"version": os.getenv('API_VER'), "website_id": website_id}
        return self.client.action(self.schema, action, params=params)
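
Both methods call new dashboard API actions through the same API client the existing endpoints use; a minimal usage sketch, assuming the API credentials from .env are configured as for the other commands and using a made-up website ID:

from seosnap_cachewarmer.service import SeosnapService

service = SeosnapService()
service.sync_pages(1)      # ['api', 'websites', 'pages', 'sync'] for website 1
service.queue_old_redo(1)  # ['api', 'websites', 'queue', 'redo', 'old'] for website 1
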


60 changes: 60 additions & 0 deletions seosnap_cachewarmer/sitemap.py
@@ -0,0 +1,60 @@
from scrapy.http import Response
from scrapy.spiders import SitemapSpider
from seosnap_cachewarmer.state import SeosnapState
import itertools
from scrapy.http import Request, XmlResponse


class SeoSnapSitemapRefresherTest(SitemapSpider):
    state: SeosnapState
    name = 'Seosnap'
    sitemap_urls = ['http://192.168.128.5/pub/sitemap/sitemap.xml']

    def __init__(self, *args, **kwargs) -> None:
        print(' -------------- sitemap init!!! ----------- ')
        self.state = SeosnapState(*args, **kwargs)
        self.name = self.state.get_name()

        print(self.state.sitemap_urls())
        for uri in self.state.sitemap_urls():
            print(uri)

        super().__init__(sitemap_urls=self.state.sitemap_urls())

    def headers(self):
        return {}

    def parse(self, response):
        print("parse")
        print("parse")
        print("parse")
        print("parse")
        print("parse")
        print(response.url)

        # yield {
        #     'url': response.url
        # }


class SeoSnapSitemapRefresher(SitemapSpider):
    name = 'test'
    state: SeosnapState

    def __init__(self, *args, **kwargs) -> None:
        print('>>>>>>----- start sitemap ------<<<<<')
        self.state = SeosnapState(*args, **kwargs)
        super().__init__(sitemap_urls=self.state.sitemap_urls())
        # self.state = state

    def parse(self, response):
        print('test')
        print(response)

        yield response

    def get_urls(self):
        print('GET get_urls')

        for url in self.state.sitemap_urls():
            yield Request(url, self.parse)
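
How the refresher gets launched is not visible in this diff; a sketch under the assumption that it is driven like the commented-out tag spider in crawl.py, through Scrapy's CrawlerProcess with the website_id argument that SeosnapState consumes (the website ID is made up):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from seosnap_cachewarmer.sitemap import SeoSnapSitemapRefresher

# Run the refresher for one website, the same way crawl.py starts its other spiders.
process = CrawlerProcess(get_project_settings())
process.crawl(SeoSnapSitemapRefresher, website_id=1)
process.start()
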
55 changes: 50 additions & 5 deletions seosnap_cachewarmer/spider.py
@@ -4,10 +4,12 @@
import urllib.parse as urllib
from datetime import datetime
from typing import Dict, List
import json

from scrapy import Request
from scrapy.http import Response
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector

from seosnap_cachewarmer.state import SeosnapState

@@ -17,42 +19,85 @@ class SeosnapSpider(SitemapSpider):
    name = 'Seosnap'

    def __init__(self, *args, **kwargs) -> None:
        print(' ----- __init__ -- ')
        self.state = SeosnapState(*args, **kwargs)
        self.name = self.state.get_name()

        print(self.state.sitemap_urls())
        print(' --- test xxx --')

        super().__init__(sitemap_urls=self.state.sitemap_urls())

    def headers(self):
        return {}

    def start_requests(self):
        print(' ----- start request -- ')

        extra_urls = (Request(url, self.parse, headers=self.headers()) for url in self.state.extra_pages())

        print('extra urls ' + extra_urls.__str__())

        return itertools.chain(extra_urls, super().start_requests())

    def parse(self, response: Response):
        print(' ----- parse -- ')
        print(response.url)
        response_body_json = json.loads(response.body)
        # print(response.body)

        data = {
-            name: response.css(selector).extract_first()
+            name: Selector(text=response_body_json['html']).css(selector).extract_first()
            for name, selector in self.state.extract_fields.items()
        }

        # Follow next links
        print("test <--------------")
        print(self.state.follow_next)

        if self.state.follow_next:
-            rel_next_url = response.css('link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()
+            rel_next_url = Selector(text=response_body_json['html']).css('link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()

            print(' ----- NEXT url -- ')
            print(rel_next_url)

            if rel_next_url is not None:
-                data['rel_next_url'] = rel_next_url
-                yield response.follow(rel_next_url, callback=self.parse)
                rel_next_url = urllib.urlparse(rel_next_url)
                rel_next_url_query = urllib.parse_qs(rel_next_url.query)
                rel_next_url_query.pop('refreshCache', None)

                old_url_parsed = urllib.urlparse(response.url)
                old_url_query = urllib.parse_qs(old_url_parsed.query)

                if 'mobile' in old_url_query and old_url_query['mobile']:
                    rel_next_url_query.update({'mobile': '1'})

                rel_next_url = rel_next_url._replace(query=urllib.urlencode(rel_next_url_query, True))

                data['rel_next_url'] = rel_next_url.geturl()
                yield response.follow(rel_next_url.geturl(), callback=self.parse)

        # Strip cacheserver from the url if possible
        url = response.url[len(self.state.cacheserver_url):].lstrip('/')
        url = urllib.urlparse(url)
        query = urllib.parse_qs(url.query)
        query.pop('refreshCache', None)
        url = url._replace(query=urllib.urlencode(query, True))

        url = urllib.urlunparse(('', '', url.path, url.params, url.query, ''))

        # Build page entity for dashboard
        cached = bytes_to_str(response.headers.get('Rendertron-Cached', None))
        cached_at = bytes_to_str(response.headers.get('Rendertron-Cached-At', None))

        print(' ----- CHECKKK -- ')
        print(url)

        yield {
            'address': url,
-            'content_type': bytes_to_str(response.headers.get('Content-Type', None)),
+            'content_type': 'text/html; charset=utf-8',
            'status_code': response.status,
            'x_magento_tags': response_body_json['tags'],
            'cache_status': 'cached' if cached == '1' or response.status == 200 else 'not-cached',
            'cached_at': cached_at,
            'extract_fields': data
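
parse() now assumes the cache server answers with a JSON body carrying the rendered page under 'html' and the Magento cache tags under 'tags'; a standalone sketch of the new extraction step, with a fabricated response body and field mapping:

import json
from scrapy.selector import Selector

# Fabricated stand-in for response.body as returned by the cache server.
body = json.dumps({
    'html': '<html><head><title>Example</title>'
            '<link rel="next" href="/category?p=2"></head><body></body></html>',
    'tags': ['cat_c_1', 'cat_p_42'],
})

payload = json.loads(body)
selector = Selector(text=payload['html'])

extract_fields = {'title': 'title::text'}  # hypothetical mapping from SeosnapState
data = {name: selector.css(css).extract_first() for name, css in extract_fields.items()}

rel_next_url = selector.css(
    'link[rel="next"]::attr(href), a[rel="next"]::attr(href)'
).extract_first()

print(data)             # {'title': 'Example'}
print(payload['tags'])  # ['cat_c_1', 'cat_p_42']
print(rel_next_url)     # /category?p=2
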
2 changes: 1 addition & 1 deletion seosnap_cachewarmer/state.py
@@ -37,7 +37,7 @@ def __init__(
        self.website_id = website_id
        self.use_queue = parse_bool(use_queue)
        self.clean_old_pages_after = parse_bool(clean_old_pages_after)
-        self.follow_next = parse_bool(follow_next) and not self.use_queue
+        self.follow_next = parse_bool(follow_next)
        self.recache = parse_bool(recache)
        self.mobile = parse_bool(mobile)
        self.errors = []
