From 78c375dbc62164f53b236f55698edb5341b87fbe Mon Sep 17 00:00:00 2001
From: unknown
Date: Thu, 28 Sep 2023 19:59:37 +0300
Subject: [PATCH] parse and save data, tests

---
 .gitignore                                    |   2 +-
 docs/conf.py                                  |   8 +-
 requirements.txt                              |   9 +-
 tests/test_wbparser.py                        | 101 +++++++---
 wbparser/main.py                              | 184 ++++++++++++++++--
 wbparser/wildberries/wildberries/items.py     |  16 +-
 .../wildberries/wildberries/spiders/goods.py  |  97 +++------
 7 files changed, 277 insertions(+), 140 deletions(-)

diff --git a/.gitignore b/.gitignore
index 68bc17f..2dc53ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,4 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/

diff --git a/docs/conf.py b/docs/conf.py
index 0b88c89..5b844d1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -11,15 +11,13 @@
 sys.path.insert(0, os.path.abspath('..'))

 project = 'WBparser'
-copyright = '2023, Maxim Liksakov'
+copyright = '2023, Maxim Liksakov, Ilyas Gasanov'
 author = 'Maxim Liksakov, Ilyas Gasanov'
 release = '0.1.0'

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

-extensions = []
-
 templates_path = ['_templates']
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

@@ -28,12 +26,12 @@
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

-html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
 html_static_path = ['_static']

 extensions = [
     'sphinx.ext.autodoc',
-    'sphinx.ext.napoleon'
+    'sphinx.ext.napoleon',
 ]

 master_doc = 'index'

diff --git a/requirements.txt b/requirements.txt
index f7a53df..7e900f3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
-scrapy
-sphinx
-twine
-pytest
\ No newline at end of file
+scrapy==2.11.0
+sphinx==7.2.6
+sphinx_rtd_theme==1.3.0
+twine==4.0.2
+pytest==7.4.2
\ No newline at end of file

diff --git a/tests/test_wbparser.py b/tests/test_wbparser.py
index c3db4f1..3f80373 100644
--- a/tests/test_wbparser.py
+++ b/tests/test_wbparser.py
@@ -1,28 +1,81 @@
-import pytest
-from unittest.mock import Mock
+import csv
+import json
+import os
+from os import listdir
+from os.path import join, isfile

 from wbparser.main import WBparser


-@pytest.fixture
-def mock_parser():
-    parser = WBparser('11152183')
-    parser.result = Mock(return_value={'brand': 'O`SHADE',
-                                       'colors': [{'id': 0, 'name': 'черный'}],
-                                       'diffPrice': False,
-                                       'id': '11152183',
-                                       'name': 'Ботинки женские натуральная кожа осенние',
-                                       'pics': 16,
-                                       'priceU': 757400,
-                                       'qty': 1012,
-                                       'questions': 1940,
-                                       'rating': 4.7,
-                                       'salePriceU': 560400,
-                                       'sizes': ['36', '37', '38', '39', '40', '41', '42', '43']})
-    return parser
-
-
-def test_parse_data(mock_parser):
-    result = mock_parser.parse_data()
-    assert result is not None
-    assert 'id', 'name' in result
+parser = WBparser()
+parser.parse_data(ids=[11152183, 87628789], urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'])
+
+
+mock_result = [
+    {
+        'id': '11152183',
+        'name': 'Ботинки женские демисезонные натуральная кожа на шнуровке',
+        'brand': 'O`SHADE',
+        'priceU': 7574,
+        'salePriceU': 5529,
+        'picsAmt': 16,
+        'colors': [{'name': 'черный', 'id': 0}],
+        'sizes': ['36', '37', '38', '39', '40', '41', '42', '43'],
+        'qty': 566,
+        'rating': 4.7,
+        'feedbacksAmt': 2055,
+        'supplierId': 18744
+    }
+]
+
+
+def test_parse_data_by_ids():
+
+    assert parser.result is not None
+    assert 11152183 in parser.ids
+    assert 87628789 in parser.ids
+    assert 'O`SHADE' in parser.brand
+    assert 'T.TACCARDI' in parser.brand
+
+
+def test_parse_data_by_urls():
+
+    assert parser.result is not None
+    assert '155761175' in parser.ids
+    assert 'Avanti' in parser.brand
+
+
+def test_save_data_to_json():
+
+    mock_parser = WBparser()
+    mock_parser.result = mock_result
+
+    mock_parser.save_data(file_name='testfileJSON')
+
+    assert 'testfileJSON.json' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]
+
+    with open('testfileJSON.json', 'r') as f:
+        data = json.loads(''.join(f.readlines()))[0]
+
+    assert data['id'] == '11152183'
+    assert data['rating'] == 4.7
+    assert data['sizes'] == ['36', '37', '38', '39', '40', '41', '42', '43']
+    assert data['colors'] == [{'name': 'черный', 'id': 0}]
+
+
+def test_save_data_to_csv():
+
+    mock_parser = WBparser()
+    mock_parser.result = mock_result
+
+    mock_parser.save_data(file_name='testfileCSV', file_format='csv')
+
+    assert 'testfileCSV.csv' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]
+
+    with open('testfileCSV.csv', mode='r') as f:
+        csv_reader = csv.DictReader(f)
+        data = [row for row in csv_reader][0]
+
+    assert data['id'] == '11152183'
+    assert data['rating'] == '4.7'
+    assert data['sizes'] == "['36', '37', '38', '39', '40', '41', '42', '43']"

diff --git a/wbparser/main.py b/wbparser/main.py
index a79933e..d305386 100644
--- a/wbparser/main.py
+++ b/wbparser/main.py
@@ -1,45 +1,187 @@
+import csv
+import json
 from dataclasses import dataclass
+from typing import Union

-from scrapy.crawler import CrawlerProcess, CrawlerRunner
-from twisted.internet import reactor, defer
+from scrapy.crawler import CrawlerRunner
+from twisted.internet import reactor

 from wbparser.wildberries.wildberries.spiders.goods import GoodsSpider


 @dataclass
 class WBparser:
-    """
-    A class used to parse data from Wildberries.
+    """A class used to parse data from Wildberries.
+    Pass several identifiers/URLs to a single WBparser object so that Scrapy crawls them asynchronously.

+    Example:
+    .. code-block:: python
+
+        parser = WBparser()
+        parser.parse_data(ids=[...])
+        parser.save_data()
+
     Example:
     .. code:: python
-
-        parser = WBparser()
-        parser.parse_data()
-        print(parser.result)
-
+
+        parser = WBparser()
+        parser.parse_data(urls=[...], ids=[...])
+        parser.save_data(file_format='json', file_name='rareProducts')
+        print(parser.result)
+        print(parser.name)
     """

-    id: int | str
-    result: dict = None
+    ids: Union[None, list[int], list[str]] = None
+    urls: Union[None, list[str]] = None
+    result: Union[None, list[dict]] = None

     def _crawl(self, runner):
-        deferred = runner.crawl(GoodsSpider, id=self.id, wb_parser=self)
+        deferred = runner.crawl(GoodsSpider, wb_parser=self)
         deferred.addBoth(lambda _: reactor.stop())
         return deferred

-    def parse_data(self) -> dict:
-        """
-
-        Parses the data for the product id and updates the result attribute.
+    def parse_data(
+        self,
+        urls: Union[None, list[str]] = None,
+        ids: Union[None, list[int], list[str]] = None
+    ) -> list[dict]:
+        """Parses the data for the given product identifiers and/or product pages.
+        The data is stored in the result attribute.

-        Returns
-        -------
-        dict
-            The parsed data.
+        :param urls: Product pages to parse, defaults to None
+        :type urls: list[str], optional
+        :param ids: Product identifiers, defaults to None
+        :type ids: list[int] or list[str], optional
+        :return: List of dicts, each of which contains info about a parsed WB item
+        :rtype: list[dict]
         """
+        self.urls = urls
+        self.ids = ids
         runner = CrawlerRunner()
         reactor.callLater(0, self._crawl, runner)
         reactor.run()
-        return self.result
\ No newline at end of file
+        return self.result
+
+    def save_data(self, file_format='json', file_name='goods', encoding='utf-8') -> None:
+        """Creates a file with the selected format and saves the parsing result to it.
+
+        :param file_format: One of two possible formats (json or csv), defaults to json
+        :type file_format: str, optional
+        :param file_name: File name, defaults to goods
+        :type file_name: str, optional
+        :param encoding: Encoding type, defaults to utf-8
+        :type encoding: str, optional
+        :return: Nothing is returned
+        :rtype: None
+        """
+        if not file_name.isalnum():
+            raise ValueError('Only letters and numbers are allowed in the filename')
+
+        if self.result:
+
+            if file_format == 'json':
+                with open(f'{file_name}.json', 'w', encoding=encoding) as f:
+                    f.write(json.dumps(self.result, indent=0))
+
+            elif file_format == 'csv':
+                keys = self.result[0].keys()
+                with open(f'{file_name}.csv', 'w', newline='', encoding=encoding) as f:
+                    dict_writer = csv.DictWriter(f, keys)
+                    dict_writer.writeheader()
+                    dict_writer.writerows(self.result)
+
+    @property
+    def name(self) -> list[str]:
+        """Accesses the product name property.
+
+        :return: Product names of parsed items
+        :rtype: list[str]
+        """
+        return [dct.get('name') for dct in self.result]
+
+    @property
+    def brand(self) -> list[str]:
+        """Accesses the brand property.
+
+        :return: Brands of parsed items
+        :rtype: list[str]
+        """
+        return [dct.get('brand') for dct in self.result]
+
+    @property
+    def priceU(self) -> list[int]:
+        """Accesses the product price property.
+
+        :return: Prices of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('priceU') for dct in self.result]
+
+    @property
+    def salePriceU(self) -> list[int]:
+        """Accesses the sale price property.
+
+        :return: Sale prices of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('salePriceU') for dct in self.result]
+
+    @property
+    def picsAmt(self) -> list[int]:
+        """Accesses the pictures amount property.
+
+        :return: Picture amounts of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('picsAmt') for dct in self.result]
+
+    @property
+    def colors(self) -> list[list[dict]]:
+        """Accesses the color property.
+
+        :return: Colors of parsed items
+        :rtype: list[list[dict]]
+        """
+        return [dct.get('colors') for dct in self.result]
+
+    @property
+    def sizes(self) -> list[list[str]]:
+        """Accesses the size property.
+
+        :return: Size values of parsed items
+        :rtype: list[list[str]]
+        """
+        return [dct.get('sizes') for dct in self.result]
+
+    @property
+    def qty(self) -> list[int]:
+        """Accesses the qty property.
+
+        :return: Amount of available products for each parsed item
+        :rtype: list[int]
+        """
+        return [dct.get('qty') for dct in self.result]
+
+    @property
+    def supplierId(self) -> list[int]:
+        """Accesses the supplier identifier property.
+
+        :return: Supplier identifiers of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('supplierId') for dct in self.result]
+
+    @property
+    def rating(self) -> list[float]:
+        """Accesses the rating property.
+
+        :return: Rating values of parsed items
+        :rtype: list[float]
+        """
+        return [dct.get('rating') for dct in self.result]
+
+    @property
+    def feedbacksAmt(self) -> list[int]:
+        """Accesses the number of feedbacks property.
+
+        :return: Number of feedbacks of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('feedbacksAmt') for dct in self.result]

diff --git a/wbparser/wildberries/wildberries/items.py b/wbparser/wildberries/wildberries/items.py
index f9d2c1b..58d7499 100644
--- a/wbparser/wildberries/wildberries/items.py
+++ b/wbparser/wildberries/wildberries/items.py
@@ -3,24 +3,14 @@

 class WildberriesItem(scrapy.Item):
     id = scrapy.Field()
-    inn = scrapy.Field()
-    supplierId = scrapy.Field()
-    supplierName = scrapy.Field()
-    legalAddress = scrapy.Field()
     name = scrapy.Field()
     brand = scrapy.Field()
     priceU = scrapy.Field()
-    sale = scrapy.Field()
     salePriceU = scrapy.Field()
-    pics = scrapy.Field()
+    picsAmt = scrapy.Field()
     colors = scrapy.Field()
     sizes = scrapy.Field()
     qty = scrapy.Field()
-    diffPrice = scrapy.Field()
-    price_history = scrapy.Field()
+    supplierId = scrapy.Field()
     rating = scrapy.Field()
-    comments = scrapy.Field()
-    sold = scrapy.Field()
-    description = scrapy.Field()
-    questions = scrapy.Field()
-
+    feedbacksAmt = scrapy.Field()

diff --git a/wbparser/wildberries/wildberries/spiders/goods.py b/wbparser/wildberries/wildberries/spiders/goods.py
index 8ce6f1d..4c2e53d 100644
--- a/wbparser/wildberries/wildberries/spiders/goods.py
+++ b/wbparser/wildberries/wildberries/spiders/goods.py
@@ -1,6 +1,3 @@
-import json
-import re
-
 import scrapy

 from ..items import WildberriesItem

@@ -13,8 +10,6 @@
                            '&stores=11767,117986,1733,686,132043' \
                            '&pricemarginCoeff=1.0&reg=0&appType=1&offlineBonus=0&onlineBonus=0&emp=0&locale=ru&' \
                            'lang=ru&curr=rub&couponsGeo=12,3,18,15,21&dest=-1029256,-102269,-2162196,-1257786&nm={}'
-# AJAX_REQUEST_PRICE_HISTORY = 'https://basket-01.wb.ru/vol{}/part{}/{}/info/price-history.json'
-# AJAX_REQUEST_SELLERS = 'https://basket-01.wb.ru/vol{}/part{}/{}/info/sellers.json'


 class GoodsSpider(scrapy.Spider):

     allowed_domains = ['www.wildberries.ru', 'wbxcatalog-ru.wildberries.ru', 'wbx-content-v2.wbstatic.net',
                        'card.wb.ru', 'product-order-qnt.wildberries.ru', 'basket-01.wb.ru']

-    def __init__(self, id=None, wb_parser=None, *args, **kwargs):
+    def __init__(self, wb_parser=None, *args, **kwargs):
         super(GoodsSpider, self).__init__(*args, **kwargs)
-        self.id = id
         self.wb_parser = wb_parser

     def start_requests(self):
-        item = WildberriesItem()
-        item['id'] = self.id
-        yield scrapy.Request(url=AJAX_REQUEST__GOOD_INFO.format(self.id), callback=self.parse_good_info,
-                             cb_kwargs={'item': item, 'id': self.id})
-
-    # def parse_page(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     id_ = kwargs['id']
-    #     if response.status in SUCCESS_CODES:
-    #         item['rating'] = response.xpath('//span[contains(@class, "product-review__rating")]//text()').get()
-    #         #item['comments'] = response.xpath("//span[@class='same-part-kt__count-review']/text()").get()[1:]
-    #         item['description'] = response.xpath('//*[@id="container"]'
-    #                                              '/div[3]/div[1]/section[3]/div[2]/div[1]/p/text()').get()
-    #         item['questions'] = response.xpath('/html/body/div[1]/main/div[2]/div/div[2]/'
-    #                                            'section[2]/div[2]/ul/li[2]/a/span/text()').get()
-    #         yield scrapy.Request(url=AJAX_REQUEST__GOOD_INFO.format(id_), callback=self.parse_good_info,
-    #                              cb_kwargs={'item': item, 'id': id_})
-    #     else:
-    #         yield item
+
+        if self.wb_parser.urls:
+            for url in self.wb_parser.urls:
+                self.wb_parser.ids = self.wb_parser.ids or []
+                self.wb_parser.ids.append(url.split('/')[-2])
+
+        if self.wb_parser.ids:
+            for id_to_crawl in self.wb_parser.ids:
+                item = WildberriesItem()
+                item['id'] = id_to_crawl
+                yield scrapy.Request(url=AJAX_REQUEST__GOOD_INFO.format(id_to_crawl), callback=self.parse_good_info,
+                                     cb_kwargs={'item': item, 'id': id_to_crawl})

     def parse_good_info(self, response, **kwargs):
+
         item = kwargs['item']
-        id_ = kwargs['id']
         if response.status in SUCCESS_CODES:
             data = response.json()
             item['name'] = data['data']['products'][0]['name']
             item['brand'] = data['data']['products'][0]['brand']
-            item['priceU'] = data['data']['products'][0]['priceU']
-            item['salePriceU'] = data['data']['products'][0]['salePriceU']
-            item['pics'] = data['data']['products'][0]['pics']
+            item['priceU'] = data['data']['products'][0]['priceU'] // 100
+            item['salePriceU'] = data['data']['products'][0]['salePriceU'] // 100
+            item['picsAmt'] = data['data']['products'][0]['pics']
             item['colors'] = data['data']['products'][0]['colors']
             item['sizes'] = []
             item['qty'] = 0
             item['rating'] = data['data']['products'][0]['reviewRating']
-            item['questions'] = data['data']['products'][0]['feedbacks']
-            item['diffPrice'] = data['data']['products'][0]['diffPrice']
+            item['feedbacksAmt'] = data['data']['products'][0]['feedbacks']
+            item['supplierId'] = data['data']['products'][0]['supplierId']
+
             for size in data['data']['products'][0]['sizes']:
                 item['sizes'].append(size['origName'])
                 for stock in size['stocks']:
                     item['qty'] += stock['qty']
-        # yield scrapy.Request(url=AJAX_REQUEST_SELLERS.format(id_[:3], id_[:5], id_), callback=self.parse_sellers_info,
-        #                      cb_kwargs={'item': item, 'id': id_})
-        if self.wb_parser:
-            self.wb_parser.result = item
-        yield item
-
-    # def parse_sellers_info(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     id_ = kwargs['id']
-    #     if response.status in SUCCESS_CODES:
-    #         data = response.json()
-    #         try:
-    #             item['supplierId'] = data['supplierId']
-    #             item['supplierName'] = data['supplierName']
-    #             item['inn'] = data['inn']
-    #             item['legalAddress'] = data['legalAddress']
-    #         except KeyError:
-    #             pass
-    #         id_ = str(id_)
-    #         yield scrapy.Request(url=AJAX_REQUEST_PRICE_HISTORY.format(id_[:3], id_[:5], id_), callback=self.parse_history_info,
-    #                              cb_kwargs={'item': item, 'id': id_})
-    #     else:
-    #         yield item
+            if self.wb_parser:
+                if not self.wb_parser.result:
+                    self.wb_parser.result = [dict(item)]
+                else:
+                    self.wb_parser.result.append(dict(item))

-    # def parse_history_info(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     if response.status in SUCCESS_CODES:
-    #         data = response.json()
-    #         item['price_history'] = data
-    #         yield scrapy.Request(url=f'https://product-order-qnt.wildberries.ru/by-nm/?nm={kwargs.get("id")}',
-    #                              callback=self.parse_qnt_info, cb_kwargs={'item': item, 'id': kwargs.get("id")})
-    #     else:
-    #         yield scrapy.Request(url=f'https://product-order-qnt.wildberries.ru/by-nm/?nm={kwargs.get("id")}',
-    #                              callback=self.parse_qnt_info, cb_kwargs={'item': item, 'id': kwargs.get("id")})
-    #
-    # def parse_qnt_info(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     if response.status in SUCCESS_CODES:
-    #         data = response.json()
-    #         qnt = data[0]['qnt']
-    #         item['qty'] = data[0]['qnt']
-    #         yield item
\ No newline at end of file
+
+        yield item
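
A minimal usage sketch of the API this patch introduces, mirroring the calls exercised in tests/test_wbparser.py (the ids and URL below are the ones used in the tests). Because parse_data() starts Twisted's reactor, which cannot be restarted within a process, every identifier and URL should be batched into a single call, as the class docstring advises:

    from wbparser.main import WBparser

    # One WBparser object, one parse_data() call: Scrapy fetches all requests asynchronously.
    parser = WBparser()
    parser.parse_data(
        ids=[11152183, 87628789],
        urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'],
    )

    print(parser.name)    # product names, one entry per parsed item
    print(parser.priceU)  # prices in rubles (the spider divides kopeck prices by 100)

    # Persist the result; file_format is 'json' (default) or 'csv'.
    parser.save_data(file_format='csv', file_name='goods')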