Commit
parse and save data, tests
IlyasDevelopment committed Sep 28, 2023
1 parent ad1a832 commit 78c375d
Showing 7 changed files with 277 additions and 140 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -157,4 +157,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
8 changes: 3 additions & 5 deletions docs/conf.py
@@ -11,15 +11,13 @@
sys.path.insert(0, os.path.abspath('..'))

project = 'WBparser'
copyright = '2023, Maxim Liksakov'
copyright = '2023, Maxim Liksakov, Ilyas Gasanov'
author = 'Maxim Liksakov, Ilyas Gasanov'
release = '0.1.0'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = []

templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

@@ -28,12 +26,12 @@
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']

extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon'
'sphinx.ext.napoleon',
]

master_doc = 'index'
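
For reference, applying the two hunks above would leave docs/conf.py looking roughly like this (a sketch reconstructed from the diff; the import lines at the top are assumed, not shown in the hunks):

    import os
    import sys
    sys.path.insert(0, os.path.abspath('..'))

    project = 'WBparser'
    copyright = '2023, Maxim Liksakov, Ilyas Gasanov'
    author = 'Maxim Liksakov, Ilyas Gasanov'
    release = '0.1.0'

    templates_path = ['_templates']
    exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

    html_theme = 'sphinx_rtd_theme'
    html_static_path = ['_static']

    extensions = [
        'sphinx.ext.autodoc',
        'sphinx.ext.napoleon',
    ]

    master_doc = 'index'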
9 changes: 5 additions & 4 deletions requirements.txt
@@ -1,4 +1,5 @@
scrapy
sphinx
twine
pytest
scrapy==2.11.0
sphinx==7.2.6
sphinx_rtd_theme==1.3.0
twine==4.0.2
pytest==7.4.2
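
With the dependencies now pinned, a local environment can be checked against the pins; a minimal sketch using importlib.metadata (not part of the commit; the distribution names are assumptions):

    from importlib.metadata import version

    pins = {
        'scrapy': '2.11.0',
        'sphinx': '7.2.6',
        'sphinx-rtd-theme': '1.3.0',
        'twine': '4.0.2',
        'pytest': '7.4.2',
    }
    for pkg, pinned in pins.items():
        installed = version(pkg)
        assert installed == pinned, f'{pkg}: expected {pinned}, got {installed}'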
101 changes: 77 additions & 24 deletions tests/test_wbparser.py
@@ -1,28 +1,81 @@
import pytest
from unittest.mock import Mock
import csv
import json
import os
from os import listdir
from os.path import join, isfile

from wbparser.main import WBparser


@pytest.fixture
def mock_parser():
parser = WBparser('11152183')
parser.result = Mock(return_value={'brand': 'O`SHADE',
'colors': [{'id': 0, 'name': 'черный'}],
'diffPrice': False,
'id': '11152183',
'name': 'Ботинки женские натуральная кожа осенние',
'pics': 16,
'priceU': 757400,
'qty': 1012,
'questions': 1940,
'rating': 4.7,
'salePriceU': 560400,
'sizes': ['36', '37', '38', '39', '40', '41', '42', '43']})
return parser


def test_parse_data(mock_parser):
result = mock_parser.parse_data()
assert result is not None
assert 'id', 'name' in result
parser = WBparser()
parser.parse_data(ids=[11152183, 87628789], urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'])


mock_result = [
{
'id': '11152183',
'name': 'Ботинки женские демисезонные натуральная кожа на шнуровке',
'brand': 'O`SHADE',
'priceU': 7574,
'salePriceU': 5529,
'picsAmt': 16,
'colors': [{'name': 'черный', 'id': 0}],
'sizes': ['36', '37', '38', '39', '40', '41', '42', '43'],
'qty': 566,
'rating': 4.7,
'feedbacksAmt': 2055,
'supplierId': 18744
}
]


def test_parse_data_by_ids():

assert parser.result is not None
assert 11152183 in parser.ids
assert 87628789 in parser.ids
assert 'O`SHADE' in parser.brand
assert 'T.TACCARDI' in parser.brand


def test_parse_data_by_urls():

assert parser.result is not None
assert '155761175' in parser.ids
assert 'Avanti' in parser.brand


def test_save_data_to_json():

mock_parser = WBparser()
mock_parser.result = mock_result

mock_parser.save_data(file_name='testfileJSON')

assert 'testfileJSON.json' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]

with open('testfileJSON.json', 'r') as f:
data = json.loads(''.join(f.readlines()))[0]

assert data['id'] == '11152183'
assert data['rating'] == 4.7
assert data['sizes'] == ['36', '37', '38', '39', '40', '41', '42', '43']
assert data['colors'] == [{'name': 'черный', 'id': 0}]


def test_save_data_to_csv():

mock_parser = WBparser()
mock_parser.result = mock_result

mock_parser.save_data(file_name='testfileCSV', file_format='csv')

assert 'testfileCSV.csv' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]

with open('testfileCSV.csv', mode='r') as f:
csv_reader = csv.DictReader(f)
data = [row for row in csv_reader][0]

assert data['id'] == '11152183'
assert data['rating'] == '4.7'
assert data['sizes'] == "['36', '37', '38', '39', '40', '41', '42', '43']"
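
Note that test_save_data_to_json and test_save_data_to_csv write testfileJSON.json and testfileCSV.csv into the current working directory and never remove them. An autouse cleanup fixture along these lines could be added (a sketch, not part of this commit):

    import os
    import pytest


    @pytest.fixture(autouse=True)
    def cleanup_saved_files():
        # Run the test first, then delete any files it created.
        yield
        for name in ('testfileJSON.json', 'testfileCSV.csv'):
            if os.path.isfile(name):
                os.remove(name)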
184 changes: 163 additions & 21 deletions wbparser/main.py
@@ -1,45 +1,187 @@
import csv
import json
from dataclasses import dataclass
from typing import Union

from scrapy.crawler import CrawlerProcess, CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor

from wbparser.wildberries.wildberries.spiders.goods import GoodsSpider


@dataclass
class WBparser:
"""
A class used to parse data from Wildberries.
"""A class used to parse data from Wildberries.
Pass several identifiers/URLs to a single WBparser object so that Scrapy can crawl them asynchronously.
Example:
.. code-block:: python
parser = WBparser()
parser.parse_data(ids=[<product_id_1>])
parser.save_data()
Example:
.. code:: python
parser = WBparser(<product_id>)
parser.parse_data()
print(parser.result)
parser = WBparser()
parser.parse_data(urls=[<url_1>, <url_2>, <url_3>,], ids=[<product_id_1>, <product_id_2>,])
parser.save_data(file_format='json', file_name='rareProducts')
print(parser.result)
print(parser.name)
"""

id: int | str
result: dict = None
ids: Union[None, list[int], list[str]] = None
urls: Union[None, list[str]] = None
result: Union[None, list[dict]] = None

def _crawl(self, runner):
deferred = runner.crawl(GoodsSpider, id=self.id, wb_parser=self)
deferred = runner.crawl(GoodsSpider, wb_parser=self)
deferred.addBoth(lambda _: reactor.stop())
return deferred

def parse_data(self) -> dict:
"""
Parses the data for the product id and updates the result attribute.
def parse_data(
self,
urls: Union[None, list[str]] = None,
ids: Union[None, list[int], list[str]] = None
) -> list[dict]:
"""Parses the data for the products identifiers or/and products pages. The data goes to the result attribute.
Returns
-------
dict
The parsed data.
:param urls: Product page URLs to parse, defaults to None
:type urls: list[str], optional
:param ids: Product identifiers, defaults to None
:type ids: list[int] | list[str], optional
:return: A list of dicts, one per parsed WB item
:rtype: list[dict]
"""
self.urls = urls
self.ids = ids
runner = CrawlerRunner()
reactor.callLater(0, self._crawl, runner)
reactor.run()
return self.result
return self.result

def save_data(self, file_format='json', file_name='goods', encoding='utf-8') -> None:
"""Creates a file with the selected format and saves the parsing result to it.
:param file_format: One of two possible formats (json or csv), defaults to json
:type file_format: str, optional
:param file_name: File name, defaults to goods
:type file_name: str, optional
:param encoding: Encoding type, defaults to utf-8
:type encoding: str, optional
:return: Nothing is returned
:rtype: None
"""
if not file_name.isalnum():
raise ValueError('Only letters and numbers are allowed in the filename')

if self.result:

if file_format == 'json':
with open(f'{file_name}.json', 'w', encoding=encoding) as f:
f.write(json.dumps(self.result, indent=0))

elif file_format == 'csv':
keys = self.result[0].keys()
with open(f'{file_name}.csv', 'w', newline='', encoding=encoding) as f:
dict_writer = csv.DictWriter(f, keys)
dict_writer.writeheader()
dict_writer.writerows(self.result)

@property
def name(self) -> list[str]:
"""Accesses the product name property.
:return: Product names of parsed items
:rtype: list[str]
"""
return [dct.get('name') for dct in self.result]

@property
def brand(self) -> list[str]:
"""Accesses the brand property.
:return: Brands of parsed items
:rtype: list[str]
"""
return [dct.get('brand') for dct in self.result]

@property
def priceU(self) -> list[int]:
"""Accesses the product price property.
:return: Prices of parsed items
:rtype: list[int]
"""
return [dct.get('priceU') for dct in self.result]

@property
def salePriceU(self) -> list[int]:
"""Accesses the sale price property.
:return: Sale prices of parsed items
:rtype: list[int]
"""
return [dct.get('salePriceU') for dct in self.result]

@property
def picsAmt(self) -> list[int]:
"""Accesses the pictures amount property.
:return: Picture amounts of parsed items
:rtype: list[int]
"""
return [dct.get('picsAmt') for dct in self.result]

@property
def colors(self) -> list[list[dict]]:
"""Accesses the color property.
:return: Colors of parsed items
:rtype: list[list[dict]]
"""
return [dct.get('colors') for dct in self.result]

@property
def sizes(self) -> list[list[str]]:
"""Accesses the size property.
:return: Size values of parsed items
:rtype: list[list[str]]
"""
return [dct.get('sizes') for dct in self.result]

@property
def qty(self) -> list[int]:
"""Accesses the qty property.
:return: Amount of available products for each parsed item
:rtype: list[int]
"""
return [dct.get('qty') for dct in self.result]

@property
def supplierId(self) -> list[str]:
"""Accesses the supplier identifier property.
:return: Supplier identifiers of parsed items
:rtype: list[str]
"""
return [dct.get('supplierId') for dct in self.result]

@property
def rating(self) -> list[float]:
"""Accesses the rating property.
:return: Rating values of parsed items
:rtype: list[float]
"""
return [dct.get('rating') for dct in self.result]

@property
def feedbacksAmt(self) -> list[int]:
"""Accesses the number of feedbacks property.
:return: Number of feedbacks of parsed items
:rtype: list[int]
"""
return [dct.get('feedbacksAmt') for dct in self.result]
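
Putting the new API together, typical usage would look roughly like this (a sketch based on the class docstring and the tests above; the IDs and URL are taken from the tests):

    from wbparser.main import WBparser

    parser = WBparser()
    parser.parse_data(
        ids=[11152183, 87628789],
        urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'],
    )

    # Each property returns one value per parsed item.
    print(parser.name)        # product names, list[str]
    print(parser.brand)       # brands, list[str]
    print(parser.salePriceU)  # sale prices, list[int]

    # Persist the full result (file_format='csv' is also supported).
    parser.save_data(file_format='json', file_name='goods')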
16 changes: 3 additions & 13 deletions wbparser/wildberries/wildberries/items.py
@@ -3,24 +3,14 @@

class WildberriesItem(scrapy.Item):
id = scrapy.Field()
inn = scrapy.Field()
supplierId = scrapy.Field()
supplierName = scrapy.Field()
legalAddress = scrapy.Field()
name = scrapy.Field()
brand = scrapy.Field()
priceU = scrapy.Field()
sale = scrapy.Field()
salePriceU = scrapy.Field()
pics = scrapy.Field()
picsAmt = scrapy.Field()
colors = scrapy.Field()
sizes = scrapy.Field()
qty = scrapy.Field()
diffPrice = scrapy.Field()
price_history = scrapy.Field()
supplierId = scrapy.Field()
rating = scrapy.Field()
comments = scrapy.Field()
sold = scrapy.Field()
description = scrapy.Field()
questions = scrapy.Field()

feedbacksAmt = scrapy.Field()
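
The remaining fields line up with the keys used in mock_result in the tests (picsAmt, feedbacksAmt, supplierId). Since scrapy.Item rejects undeclared fields, removed keys such as questions can no longer be set; a small sketch illustrating this (the import path is inferred from main.py and may differ):

    from wbparser.wildberries.wildberries.items import WildberriesItem

    item = WildberriesItem()
    item['id'] = '11152183'
    item['picsAmt'] = 16
    item['feedbacksAmt'] = 2055

    try:
        item['questions'] = 1940  # field removed in this commit
    except KeyError:
        print('questions is no longer a declared field')

    print(dict(item))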
