Merge pull request #8 from experius/feature/PWAI-460
Feature/pwai 460
lewisvoncken authored Jun 13, 2022
2 parents 386f777 + 780db90 commit 720af7e
Showing 9 changed files with 213 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .env.example
@@ -3,7 +3,7 @@ API_VER=v1
API_NAME=snaptron
API_PASS=snaptron

-CACHEWARMER_CACHE_SERVER_URL=http://rendertron:3000/render
+CACHEWARMER_CACHE_SERVER_URL=http://rendertron:3000/seo-snap
CACHEWARMER_USER_AGENT=Seosnap
CACHEWARMER_THREADS=2
CACHEWARMER_CONCURRENT_REQUESTS=16
48 changes: 48 additions & 0 deletions crawl.py
@@ -1,6 +1,7 @@
#!/usr/bin/python
import os, sys, logging

import requests
import click
from functools import reduce
from dotenv import load_dotenv
@@ -11,6 +12,10 @@

from seosnap_cachewarmer.service import SeosnapService
from seosnap_cachewarmer.spider import SeosnapSpider
# from seosnap_cachewarmer.tagSpider import SeosnapTagSpider
from seosnap_cachewarmer.state import SeosnapState
from seosnap_cachewarmer.sitemap import SeoSnapSitemapRefresher
import xml.etree.ElementTree as ET

load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '.env'))
configure_logging(install_root_handler=False)
@@ -34,6 +39,31 @@ def load(website_ids: str, **args):
        execute(argv=[sys.argv[0], 'crawl', 'Seosnap', '-a', f'website_id={website_id}'] + arg_tokens)


# @cli.command()
# @click.argument('website_ids')
# @click.argument('tag')
# def tag(website_ids: str, tag: str, **args):
# print("-- start recacheByTag --")
# print("test1 ckasdhlasd")
# print(website_ids)
# print(tag)
#
# try:
# settings = get_project_settings()
# process = CrawlerProcess(settings)
# for website_id in website_ids.split(','):
# process.crawl(
# SeosnapTagSpider,
# website_id=website_id,
# tag=tag,
# **args
# )
# process.start()
# except Exception as e:
# click.echo(str(e), err=True)



@cli.command()
@click.argument('website_ids')
@click.option('--follow_next', type=bool, default=True, help='Follows rel-next links if enabled')
@@ -56,6 +86,24 @@ def cache(website_ids: str, **args):
        click.echo(str(e), err=True)


@cli.command()
@click.argument('website_ids')
def sync(website_ids: str, *args, **kwargs):
    print('Start sync')
    for website_id in website_ids.split(','):
        service = SeosnapService()
        service.sync_pages(website_id)


@cli.command()
@click.argument('website_ids')
def redooldqueue(website_ids: str, *args, **kwargs):
    print('Start redo')
    for website_id in website_ids.split(','):
        service = SeosnapService()
        service.queue_old_redo(website_id)


@cli.command()
@click.argument('website_ids')
def clean(website_ids: str):
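
The two new commands can be exercised without going through the shell, for instance with click's test runner; a minimal sketch, assuming crawl.py is importable as a module named crawl and that the decorators above hang off a click group called cli (the website IDs are made up):

# Minimal sketch: invoke the new sync and redooldqueue commands in-process.
# Assumptions: crawl.py is importable as `crawl`, the click group is named `cli`,
# and websites 1 and 2 are hypothetical dashboard IDs.
from click.testing import CliRunner

from crawl import cli

runner = CliRunner()

result = runner.invoke(cli, ["sync", "1,2"])        # sync pages for websites 1 and 2
print(result.exit_code, result.output)

result = runner.invoke(cli, ["redooldqueue", "1"])  # re-queue old pages for website 1
print(result.exit_code, result.output)
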
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -3,7 +3,7 @@ services:
  cachewarmer:
    build:
      context: .
-    image: experiusnl/seosnap-cachewarmer:${TAG}
+    image: experiusnl/seosnap-cachewarmer
    container_name: seosnap_cachewarmer
    restart: "no"
    environment:
2 changes: 1 addition & 1 deletion seosnap_cachewarmer/exporter.py
@@ -11,7 +11,7 @@ class SeosnapItemExporter(BaseItemExporter):
    service: SeosnapService
    use_queue: bool
    buffer: List[dict]
-    buffer_size: int = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 50))
+    buffer_size: int = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 2))

    def __init__(self, website_id, use_queue=False, buffer_size=None, **kwargs):
        super().__init__(**kwargs)
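
The default flush size drops from 50 to 2, but the environment variable still takes precedence; a minimal sketch of that fallback, assuming nothing beyond the os.getenv expression shown above (the override value is illustrative):

import os

# The env var wins; only when it is absent does the new default of 2 apply.
os.environ["CACHEWARMER_BUFFER_SIZE"] = "50"   # illustrative override, e.g. from .env
buffer_size = int(os.getenv('CACHEWARMER_BUFFER_SIZE', 2))
print(buffer_size)  # 50 with the override, 2 without it
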
44 changes: 42 additions & 2 deletions seosnap_cachewarmer/middleware.py
@@ -14,18 +14,58 @@ class CacheServerMiddleware(object):
    def process_request(self, request: Request, spider: SeosnapSpider):
        state = spider.state

        parsed_url = urllib.urlparse(request.url)

        print(" process_request ???")
        print(" 1 < -")
        print(parsed_url.path)
        print(" 1.5 < -")
        print(parsed_url.geturl())
        print(" 1.7 < -")
        print(parsed_url.hostname)
        print(" 1.8 < -")
        print(parsed_url.path)
        print(" 1.9 < -")
        print(parsed_url.netloc)
        print(" 1.95 < -")
        print(parsed_url.scheme)
        print(" 2 < -")
        print(parsed_url.query)
        print(" 3 < -")
        print(urllib.parse_qs(parsed_url.query))
        print(" 4 < -")

        if CACHE_REQUEST_FLAG not in request.meta \
                and request.url not in state.sitemap_urls() \
                and not request.url.endswith('.xml') \
                and state.cacheserver_url:
            request.meta[CACHE_REQUEST_FLAG] = True

            # Quote the request params as required by rendertron
-            url = urllib.quote(request.url, safe='/:')
+            url = request.url

            # Add mobile param if we are rendering mobile pages
            params = {}
            if state.recache: params['refreshCache'] = 'true'
            if state.mobile: params['mobile'] = 1

            print("test")
            if 'mobile' in urllib.parse_qs(parsed_url.query) and urllib.parse_qs(parsed_url.query)['mobile']:
                print("test2")
                params['mobile'] = 1
                print(url)
                new_url = urllib.urlparse(url)
                query = urllib.parse_qs(new_url.query)
                print(query)
                query.pop('mobile', None)
                new_url = new_url._replace(query=urllib.urlencode(query, True))
                print(new_url)
                url = new_url.geturl()
                print("xasdaskdsa")
                print(url)

            print("test3")

            url = urllib.quote(url, safe='/:')

            return request.replace(
                url=f'{state.cacheserver_url}/{url}?{urllib.urlencode(params)}',
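
In effect the middleware now forwards every crawl request to the cache server, carrying a mobile=1 flag over from the original query string (and stripping it from the forwarded URL) and adding refreshCache when recaching; a standalone sketch of that rewriting step, based only on the hunk above, with a made-up helper name and example URLs:

import urllib.parse as urllib

def rewrite_for_cacheserver(url: str, cacheserver_url: str, recache: bool = False) -> str:
    # Mirror the middleware: move a mobile=1 query flag onto the render request,
    # drop it from the origin URL, then quote the URL and prepend the cache server.
    parsed = urllib.urlparse(url)
    query = urllib.parse_qs(parsed.query)

    params = {}
    if recache:
        params['refreshCache'] = 'true'

    if query.get('mobile'):
        params['mobile'] = 1
        query.pop('mobile', None)
        parsed = parsed._replace(query=urllib.urlencode(query, True))
        url = parsed.geturl()

    quoted = urllib.quote(url, safe='/:')
    return f'{cacheserver_url}/{quoted}?{urllib.urlencode(params)}'

# Example with hypothetical URLs:
print(rewrite_for_cacheserver(
    'https://example.com/category?page=2&mobile=1',
    'http://rendertron:3000/seo-snap',
    recache=True,
))
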
9 changes: 9 additions & 0 deletions seosnap_cachewarmer/service.py
@@ -53,5 +53,14 @@ def report_errors(self, website_id: int, errors: List[dict]) -> List[dict]:
        params = {"version": os.getenv('API_VER'), "website_id": website_id, "errors": errors}
        return self.client.action(self.schema, action, params=params)

    def sync_pages(self, website_id: int):
        action = ['api', 'websites', 'pages', 'sync']
        params = {"version": os.getenv('API_VER'), "website_id": website_id}
        return self.client.action(self.schema, action, params=params)

    def queue_old_redo(self, website_id: int):
        action = ['api', 'websites', 'queue', 'redo', 'old']
        params = {"version": os.getenv('API_VER'), "website_id": website_id}
        return self.client.action(self.schema, action, params=params)
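
Both methods call new dashboard API actions through the same API client the existing endpoints use; a minimal usage sketch, assuming the API credentials from .env are configured as for the other commands and using a made-up website ID:

from seosnap_cachewarmer.service import SeosnapService

service = SeosnapService()
service.sync_pages(1)      # ['api', 'websites', 'pages', 'sync'] for website 1
service.queue_old_redo(1)  # ['api', 'websites', 'queue', 'redo', 'old'] for website 1
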


60 changes: 60 additions & 0 deletions seosnap_cachewarmer/sitemap.py
@@ -0,0 +1,60 @@
from scrapy.http import Response
from scrapy.spiders import SitemapSpider
from seosnap_cachewarmer.state import SeosnapState
import itertools
from scrapy.http import Request, XmlResponse


class SeoSnapSitemapRefresherTest(SitemapSpider):
    state: SeosnapState
    name = 'Seosnap'
    sitemap_urls = ['http://192.168.128.5/pub/sitemap/sitemap.xml']

    def __init__(self, *args, **kwargs) -> None:
        print(' -------------- sitemap init!!! ----------- ')
        self.state = SeosnapState(*args, **kwargs)
        self.name = self.state.get_name()

        print(self.state.sitemap_urls())
        for uri in self.state.sitemap_urls():
            print(uri)

        super().__init__(sitemap_urls=self.state.sitemap_urls())

    def headers(self):
        return {}

    def parse(self, response):
        print("parse")
        print("parse")
        print("parse")
        print("parse")
        print("parse")
        print(response.url)

        # yield {
        #     'url': response.url
        # }


class SeoSnapSitemapRefresher(SitemapSpider):
    name = 'test'
    state: SeosnapState

    def __init__(self, *args, **kwargs) -> None:
        print('>>>>>>----- start sitemap ------<<<<<')
        self.state = SeosnapState(*args, **kwargs)
        super().__init__(sitemap_urls=self.state.sitemap_urls())
        # self.state = state

    def parse(self, response):
        print('test')
        print(response)

        yield response

    def get_urls(self):
        print('GET get_urls')

        for url in self.state.sitemap_urls():
            yield Request(url, self.parse)
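
How the refresher gets launched is not visible in this diff; a sketch under the assumption that it is driven like the commented-out tag spider in crawl.py, through Scrapy's CrawlerProcess with the website_id argument that SeosnapState consumes (the website ID is made up):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from seosnap_cachewarmer.sitemap import SeoSnapSitemapRefresher

# Run the refresher for one website, the same way crawl.py starts its other spiders.
process = CrawlerProcess(get_project_settings())
process.crawl(SeoSnapSitemapRefresher, website_id=1)
process.start()
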
55 changes: 50 additions & 5 deletions seosnap_cachewarmer/spider.py
@@ -4,10 +4,12 @@
import urllib.parse as urllib
from datetime import datetime
from typing import Dict, List
import json

from scrapy import Request
from scrapy.http import Response
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector

from seosnap_cachewarmer.state import SeosnapState

@@ -17,42 +19,85 @@ class SeosnapSpider(SitemapSpider):
    name = 'Seosnap'

    def __init__(self, *args, **kwargs) -> None:
        print(' ----- __init__ -- ')
        self.state = SeosnapState(*args, **kwargs)
        self.name = self.state.get_name()

        print(self.state.sitemap_urls())
        print(' --- test xxx --')

        super().__init__(sitemap_urls=self.state.sitemap_urls())

    def headers(self):
        return {}

    def start_requests(self):
        print(' ----- start request -- ')

        extra_urls = (Request(url, self.parse, headers=self.headers()) for url in self.state.extra_pages())

        print('extra urls ' + extra_urls.__str__())

        return itertools.chain(extra_urls, super().start_requests())

    def parse(self, response: Response):
        print(' ----- parse -- ')
        print(response.url)
        response_body_json = json.loads(response.body)
        # print(response.body)

        data = {
-            name: response.css(selector).extract_first()
+            name: Selector(text=response_body_json['html']).css(selector).extract_first()
            for name, selector in self.state.extract_fields.items()
        }

        # Follow next links
        print("test <--------------")
        print(self.state.follow_next)

        if self.state.follow_next:
-            rel_next_url = response.css('link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()
+            rel_next_url = Selector(text=response_body_json['html']).css('link[rel="next"]::attr(href), a[rel="next"]::attr(href)').extract_first()

            print(' ----- NEXT url -- ')
            print(rel_next_url)

            if rel_next_url is not None:
-                data['rel_next_url'] = rel_next_url
-                yield response.follow(rel_next_url, callback=self.parse)
                rel_next_url = urllib.urlparse(rel_next_url)
                rel_next_url_query = urllib.parse_qs(rel_next_url.query)
                rel_next_url_query.pop('refreshCache', None)

                old_url_parsed = urllib.urlparse(response.url)
                old_url_query = urllib.parse_qs(old_url_parsed.query)

                if 'mobile' in old_url_query and old_url_query['mobile']:
                    rel_next_url_query.update({'mobile': '1'})

                rel_next_url = rel_next_url._replace(query=urllib.urlencode(rel_next_url_query, True))

                data['rel_next_url'] = rel_next_url.geturl()
                yield response.follow(rel_next_url.geturl(), callback=self.parse)

        # Strip cacheserver from the url if possible
        url = response.url[len(self.state.cacheserver_url):].lstrip('/')
        url = urllib.urlparse(url)
        query = urllib.parse_qs(url.query)
        query.pop('refreshCache', None)
        url = url._replace(query=urllib.urlencode(query, True))

        url = urllib.urlunparse(('', '', url.path, url.params, url.query, ''))

        # Build page entity for dashboard
        cached = bytes_to_str(response.headers.get('Rendertron-Cached', None))
        cached_at = bytes_to_str(response.headers.get('Rendertron-Cached-At', None))

        print(' ----- CHECKKK -- ')
        print(url)

        yield {
            'address': url,
-            'content_type': bytes_to_str(response.headers.get('Content-Type', None)),
+            'content_type': 'text/html; charset=utf-8',
            'status_code': response.status,
            'x_magento_tags': response_body_json['tags'],
            'cache_status': 'cached' if cached == '1' or response.status == 200 else 'not-cached',
            'cached_at': cached_at,
            'extract_fields': data
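
parse() now assumes the cache server answers with a JSON body carrying the rendered page under 'html' and the Magento cache tags under 'tags'; a standalone sketch of the new extraction step, with a fabricated response body and field mapping:

import json
from scrapy.selector import Selector

# Fabricated stand-in for response.body as returned by the cache server.
body = json.dumps({
    'html': '<html><head><title>Example</title>'
            '<link rel="next" href="/category?p=2"></head><body></body></html>',
    'tags': ['cat_c_1', 'cat_p_42'],
})

payload = json.loads(body)
selector = Selector(text=payload['html'])

extract_fields = {'title': 'title::text'}  # hypothetical mapping from SeosnapState
data = {name: selector.css(css).extract_first() for name, css in extract_fields.items()}

rel_next_url = selector.css(
    'link[rel="next"]::attr(href), a[rel="next"]::attr(href)'
).extract_first()

print(data)             # {'title': 'Example'}
print(payload['tags'])  # ['cat_c_1', 'cat_p_42']
print(rel_next_url)     # /category?p=2
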
2 changes: 1 addition & 1 deletion seosnap_cachewarmer/state.py
@@ -37,7 +37,7 @@ def __init__(
        self.website_id = website_id
        self.use_queue = parse_bool(use_queue)
        self.clean_old_pages_after = parse_bool(clean_old_pages_after)
-        self.follow_next = parse_bool(follow_next) and not self.use_queue
+        self.follow_next = parse_bool(follow_next)
        self.recache = parse_bool(recache)
        self.mobile = parse_bool(mobile)
        self.errors = []
