From 78c375dbc62164f53b236f55698edb5341b87fbe Mon Sep 17 00:00:00 2001
From: unknown
Date: Thu, 28 Sep 2023 19:59:37 +0300
Subject: [PATCH] parse and save data, tests

---
 .gitignore                                    |   2 +-
 docs/conf.py                                  |   8 +-
 requirements.txt                              |   9 +-
 tests/test_wbparser.py                        | 101 +++++++---
 wbparser/main.py                              | 184 ++++++++++++++++--
 wbparser/wildberries/wildberries/items.py     |  16 +-
 .../wildberries/wildberries/spiders/goods.py  |  97 +++------
 7 files changed, 277 insertions(+), 140 deletions(-)

diff --git a/.gitignore b/.gitignore
index 68bc17f..2dc53ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,4 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/

diff --git a/docs/conf.py b/docs/conf.py
index 0b88c89..5b844d1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -11,15 +11,13 @@
 sys.path.insert(0, os.path.abspath('..'))

 project = 'WBparser'
-copyright = '2023, Maxim Liksakov'
+copyright = '2023, Maxim Liksakov, Ilyas Gasanov'
 author = 'Maxim Liksakov, Ilyas Gasanov'
 release = '0.1.0'

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

-extensions = []
-
 templates_path = ['_templates']
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

@@ -28,12 +26,12 @@
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

-html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
 html_static_path = ['_static']

 extensions = [
     'sphinx.ext.autodoc',
-    'sphinx.ext.napoleon'
+    'sphinx.ext.napoleon',
 ]

 master_doc = 'index'

diff --git a/requirements.txt b/requirements.txt
index f7a53df..7e900f3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
-scrapy
-sphinx
-twine
-pytest
\ No newline at end of file
+scrapy==2.11.0
+sphinx==7.2.6
+sphinx_rtd_theme==1.3.0
+twine==4.0.2
+pytest==7.4.2
\ No newline at end of file

diff --git a/tests/test_wbparser.py b/tests/test_wbparser.py
index c3db4f1..3f80373 100644
--- a/tests/test_wbparser.py
+++ b/tests/test_wbparser.py
@@ -1,28 +1,81 @@
-import pytest
-from unittest.mock import Mock
+import csv
+import json
+import os
+from os import listdir
+from os.path import join, isfile

 from wbparser.main import WBparser


-@pytest.fixture
-def mock_parser():
-    parser = WBparser('11152183')
-    parser.result = Mock(return_value={'brand': 'O`SHADE',
-                                       'colors': [{'id': 0, 'name': 'черный'}],
-                                       'diffPrice': False,
-                                       'id': '11152183',
-                                       'name': 'Ботинки женские натуральная кожа осенние',
-                                       'pics': 16,
-                                       'priceU': 757400,
-                                       'qty': 1012,
-                                       'questions': 1940,
-                                       'rating': 4.7,
-                                       'salePriceU': 560400,
-                                       'sizes': ['36', '37', '38', '39', '40', '41', '42', '43']})
-    return parser
-
-
-def test_parse_data(mock_parser):
-    result = mock_parser.parse_data()
-    assert result is not None
-    assert 'id', 'name' in result
+parser = WBparser()
+parser.parse_data(ids=[11152183, 87628789], urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'])
+
+
+mock_result = [
+    {
+        'id': '11152183',
+        'name': 'Ботинки женские демисезонные натуральная кожа на шнуровке',
+        'brand': 'O`SHADE',
+        'priceU': 7574,
+        'salePriceU': 5529,
+        'picsAmt': 16,
+        'colors': [{'name': 'черный', 'id': 0}],
+        'sizes': ['36', '37', '38', '39', '40', '41', '42', '43'],
+        'qty': 566,
+        'rating': 4.7,
+        'feedbacksAmt': 2055,
+        'supplierId': 18744
+    }
+]
+
+
+def test_parse_data_by_ids():
+
+    assert parser.result is not None
+    assert 11152183 in parser.ids
+    assert 87628789 in parser.ids
+    assert 'O`SHADE' in parser.brand
+    assert 'T.TACCARDI' in parser.brand
+
+
+def test_parse_data_by_urls():
+
+    assert parser.result is not None
+    assert '155761175' in parser.ids
+    assert 'Avanti' in parser.brand
+
+
+def test_save_data_to_json():
+
+    mock_parser = WBparser()
+    mock_parser.result = mock_result
+
+    mock_parser.save_data(file_name='testfileJSON')
+
+    assert 'testfileJSON.json' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]
+
+    with open('testfileJSON.json', 'r') as f:
+        data = json.loads(''.join(f.readlines()))[0]
+
+    assert data['id'] == '11152183'
+    assert data['rating'] == 4.7
+    assert data['sizes'] == ['36', '37', '38', '39', '40', '41', '42', '43']
+    assert data['colors'] == [{'name': 'черный', 'id': 0}]
+
+
+def test_save_data_to_csv():
+
+    mock_parser = WBparser()
+    mock_parser.result = mock_result
+
+    mock_parser.save_data(file_name='testfileCSV', file_format='csv')
+
+    assert 'testfileCSV.csv' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]
+
+    with open('testfileCSV.csv', mode='r') as f:
+        csv_reader = csv.DictReader(f)
+        data = [row for row in csv_reader][0]
+
+    assert data['id'] == '11152183'
+    assert data['rating'] == '4.7'
+    assert data['sizes'] == "['36', '37', '38', '39', '40', '41', '42', '43']"

diff --git a/wbparser/main.py b/wbparser/main.py
index a79933e..d305386 100644
--- a/wbparser/main.py
+++ b/wbparser/main.py
@@ -1,45 +1,187 @@
+import csv
+import json
 from dataclasses import dataclass
+from typing import Union

-from scrapy.crawler import CrawlerProcess, CrawlerRunner
-from twisted.internet import reactor, defer
+from scrapy.crawler import CrawlerRunner
+from twisted.internet import reactor

 from wbparser.wildberries.wildberries.spiders.goods import GoodsSpider


 @dataclass
 class WBparser:
-    """
-    A class used to parse data from Wildberries.
+    """A class used to parse data from Wildberries.
+    Pass several identifiers/URLs to a single WBparser object so that Scrapy crawls them asynchronously.

+    Example:
+    .. code-block:: python
+
+        parser = WBparser()
+        parser.parse_data(ids=[...])
+        parser.save_data()
+
     Example:
     .. code:: python
-
-        parser = WBparser()
-        parser.parse_data()
-        print(parser.result)
-
+
+        parser = WBparser()
+        parser.parse_data(urls=[...], ids=[...])
+        parser.save_data(file_format='json', file_name='rareProducts')
+        print(parser.result)
+        print(parser.name)
     """

-    id: int | str
-    result: dict = None
+    ids: Union[None, list[int], list[str]] = None
+    urls: Union[None, list[str]] = None
+    result: Union[None, list[dict]] = None

     def _crawl(self, runner):
-        deferred = runner.crawl(GoodsSpider, id=self.id, wb_parser=self)
+        deferred = runner.crawl(GoodsSpider, wb_parser=self)
         deferred.addBoth(lambda _: reactor.stop())
         return deferred

-    def parse_data(self) -> dict:
-        """
-
-        Parses the data for the product id and updates the result attribute.
+    def parse_data(
+        self,
+        urls: Union[None, list[str]] = None,
+        ids: Union[None, list[int], list[str]] = None
+    ) -> list[dict]:
+        """Parses the data for the given product identifiers and/or product pages.
+        The data is stored in the result attribute.

-        Returns
-        -------
-        dict
-            The parsed data.
+        :param urls: Product pages to parse, defaults to None
+        :type urls: list[str], optional
+        :param ids: Product identifiers, defaults to None
+        :type ids: list[int] or list[str], optional
+        :return: List of dicts, each of which contains info about a parsed WB item
+        :rtype: list[dict]
         """
+        self.urls = urls
+        self.ids = ids
         runner = CrawlerRunner()
         reactor.callLater(0, self._crawl, runner)
         reactor.run()
-        return self.result
\ No newline at end of file
+        return self.result
+
+    def save_data(self, file_format='json', file_name='goods', encoding='utf-8') -> None:
+        """Creates a file with the selected format and saves the parsing result to it.
+
+        :param file_format: One of two possible formats (json or csv), defaults to json
+        :type file_format: str, optional
+        :param file_name: File name, defaults to goods
+        :type file_name: str, optional
+        :param encoding: Encoding type, defaults to utf-8
+        :type encoding: str, optional
+        :return: Nothing is returned
+        :rtype: None
+        """
+        if not file_name.isalnum():
+            raise ValueError('Only letters and numbers are allowed in the filename')
+
+        if self.result:
+
+            if file_format == 'json':
+                with open(f'{file_name}.json', 'w', encoding=encoding) as f:
+                    f.write(json.dumps(self.result, indent=0))
+
+            elif file_format == 'csv':
+                keys = self.result[0].keys()
+                with open(f'{file_name}.csv', 'w', newline='', encoding=encoding) as f:
+                    dict_writer = csv.DictWriter(f, keys)
+                    dict_writer.writeheader()
+                    dict_writer.writerows(self.result)
+
+    @property
+    def name(self) -> list[str]:
+        """Accesses the product name property.
+
+        :return: Product names of parsed items
+        :rtype: list[str]
+        """
+        return [dct.get('name') for dct in self.result]
+
+    @property
+    def brand(self) -> list[str]:
+        """Accesses the brand property.
+
+        :return: Brands of parsed items
+        :rtype: list[str]
+        """
+        return [dct.get('brand') for dct in self.result]
+
+    @property
+    def priceU(self) -> list[int]:
+        """Accesses the product price property.
+
+        :return: Prices of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('priceU') for dct in self.result]
+
+    @property
+    def salePriceU(self) -> list[int]:
+        """Accesses the sale price property.
+
+        :return: Sale prices of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('salePriceU') for dct in self.result]
+
+    @property
+    def picsAmt(self) -> list[int]:
+        """Accesses the pictures amount property.
+
+        :return: Picture amounts of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('picsAmt') for dct in self.result]
+
+    @property
+    def colors(self) -> list[list[dict]]:
+        """Accesses the color property.
+
+        :return: Colors of parsed items
+        :rtype: list[list[dict]]
+        """
+        return [dct.get('colors') for dct in self.result]
+
+    @property
+    def sizes(self) -> list[list[str]]:
+        """Accesses the size property.
+
+        :return: Size values of parsed items
+        :rtype: list[list[str]]
+        """
+        return [dct.get('sizes') for dct in self.result]
+
+    @property
+    def qty(self) -> list[int]:
+        """Accesses the qty property.
+
+        :return: Amount of available products for each parsed item
+        :rtype: list[int]
+        """
+        return [dct.get('qty') for dct in self.result]
+
+    @property
+    def supplierId(self) -> list[int]:
+        """Accesses the supplier identifier property.
+
+        :return: Supplier identifiers of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('supplierId') for dct in self.result]
+
+    @property
+    def rating(self) -> list[float]:
+        """Accesses the rating property.
+
+        :return: Rating values of parsed items
+        :rtype: list[float]
+        """
+        return [dct.get('rating') for dct in self.result]
+
+    @property
+    def feedbacksAmt(self) -> list[int]:
+        """Accesses the number of feedbacks property.
+
+        :return: Number of feedbacks of parsed items
+        :rtype: list[int]
+        """
+        return [dct.get('feedbacksAmt') for dct in self.result]

diff --git a/wbparser/wildberries/wildberries/items.py b/wbparser/wildberries/wildberries/items.py
index f9d2c1b..58d7499 100644
--- a/wbparser/wildberries/wildberries/items.py
+++ b/wbparser/wildberries/wildberries/items.py
@@ -3,24 +3,14 @@

 class WildberriesItem(scrapy.Item):
     id = scrapy.Field()
-    inn = scrapy.Field()
-    supplierId = scrapy.Field()
-    supplierName = scrapy.Field()
-    legalAddress = scrapy.Field()
     name = scrapy.Field()
     brand = scrapy.Field()
     priceU = scrapy.Field()
-    sale = scrapy.Field()
     salePriceU = scrapy.Field()
-    pics = scrapy.Field()
+    picsAmt = scrapy.Field()
     colors = scrapy.Field()
     sizes = scrapy.Field()
     qty = scrapy.Field()
-    diffPrice = scrapy.Field()
-    price_history = scrapy.Field()
+    supplierId = scrapy.Field()
     rating = scrapy.Field()
-    comments = scrapy.Field()
-    sold = scrapy.Field()
-    description = scrapy.Field()
-    questions = scrapy.Field()
-
+    feedbacksAmt = scrapy.Field()

diff --git a/wbparser/wildberries/wildberries/spiders/goods.py b/wbparser/wildberries/wildberries/spiders/goods.py
index 8ce6f1d..4c2e53d 100644
--- a/wbparser/wildberries/wildberries/spiders/goods.py
+++ b/wbparser/wildberries/wildberries/spiders/goods.py
@@ -1,6 +1,3 @@
-import json
-import re
-
 import scrapy

 from ..items import WildberriesItem

@@ -13,8 +10,6 @@
                            '&stores=11767,117986,1733,686,132043' \
                            '&pricemarginCoeff=1.0&reg=0&appType=1&offlineBonus=0&onlineBonus=0&emp=0&locale=ru&' \
                            'lang=ru&curr=rub&couponsGeo=12,3,18,15,21&dest=-1029256,-102269,-2162196,-1257786&nm={}'
-# AJAX_REQUEST_PRICE_HISTORY = 'https://basket-01.wb.ru/vol{}/part{}/{}/info/price-history.json'
-# AJAX_REQUEST_SELLERS = 'https://basket-01.wb.ru/vol{}/part{}/{}/info/sellers.json'


 class GoodsSpider(scrapy.Spider):

     allowed_domains = ['www.wildberries.ru', 'wbxcatalog-ru.wildberries.ru', 'wbx-content-v2.wbstatic.net',
                        'card.wb.ru', 'product-order-qnt.wildberries.ru', 'basket-01.wb.ru']

-    def __init__(self, id=None, wb_parser=None, *args, **kwargs):
+    def __init__(self, wb_parser=None, *args, **kwargs):
         super(GoodsSpider, self).__init__(*args, **kwargs)
-        self.id = id
         self.wb_parser = wb_parser

     def start_requests(self):
-        item = WildberriesItem()
-        item['id'] = self.id
-        yield scrapy.Request(url=AJAX_REQUEST__GOOD_INFO.format(self.id), callback=self.parse_good_info,
-                             cb_kwargs={'item': item, 'id': self.id})
-
-    # def parse_page(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     id_ = kwargs['id']
-    #     if response.status in SUCCESS_CODES:
-    #         item['rating'] = response.xpath('//span[contains(@class, "product-review__rating")]//text()').get()
-    #         #item['comments'] = response.xpath("//span[@class='same-part-kt__count-review']/text()").get()[1:]
-    #         item['description'] = response.xpath('//*[@id="container"]'
-    #                                              '/div[3]/div[1]/section[3]/div[2]/div[1]/p/text()').get()
-    #         item['questions'] = response.xpath('/html/body/div[1]/main/div[2]/div/div[2]/'
-    #                                            'section[2]/div[2]/ul/li[2]/a/span/text()').get()
-    #         yield scrapy.Request(url=AJAX_REQUEST__GOOD_INFO.format(id_), callback=self.parse_good_info,
-    #                              cb_kwargs={'item': item, 'id': id_})
-    #     else:
-    #         yield item
+
+        if self.wb_parser.urls:
+            for url in self.wb_parser.urls:
+                self.wb_parser.ids = self.wb_parser.ids or []
+                self.wb_parser.ids.append(url.split('/')[-2])
+
+        if self.wb_parser.ids:
+            for id_to_crawl in self.wb_parser.ids:
+                item = WildberriesItem()
+                item['id'] = id_to_crawl
+                yield scrapy.Request(url=AJAX_REQUEST__GOOD_INFO.format(id_to_crawl), callback=self.parse_good_info,
+                                     cb_kwargs={'item': item, 'id': id_to_crawl})

     def parse_good_info(self, response, **kwargs):
+
         item = kwargs['item']
-        id_ = kwargs['id']
         if response.status in SUCCESS_CODES:
             data = response.json()
             item['name'] = data['data']['products'][0]['name']
             item['brand'] = data['data']['products'][0]['brand']
-            item['priceU'] = data['data']['products'][0]['priceU']
-            item['salePriceU'] = data['data']['products'][0]['salePriceU']
-            item['pics'] = data['data']['products'][0]['pics']
+            item['priceU'] = data['data']['products'][0]['priceU'] // 100
+            item['salePriceU'] = data['data']['products'][0]['salePriceU'] // 100
+            item['picsAmt'] = data['data']['products'][0]['pics']
             item['colors'] = data['data']['products'][0]['colors']
             item['sizes'] = []
             item['qty'] = 0
             item['rating'] = data['data']['products'][0]['reviewRating']
-            item['questions'] = data['data']['products'][0]['feedbacks']
-            item['diffPrice'] = data['data']['products'][0]['diffPrice']
+            item['feedbacksAmt'] = data['data']['products'][0]['feedbacks']
+            item['supplierId'] = data['data']['products'][0]['supplierId']
+
             for size in data['data']['products'][0]['sizes']:
                 item['sizes'].append(size['origName'])
                 for stock in size['stocks']:
                     item['qty'] += stock['qty']
-        # yield scrapy.Request(url=AJAX_REQUEST_SELLERS.format(id_[:3], id_[:5], id_), callback=self.parse_sellers_info,
-        #                      cb_kwargs={'item': item, 'id': id_})
-        if self.wb_parser:
-            self.wb_parser.result = item
-        yield item
-
-    # def parse_sellers_info(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     id_ = kwargs['id']
-    #     if response.status in SUCCESS_CODES:
-    #         data = response.json()
-    #         try:
-    #             item['supplierId'] = data['supplierId']
-    #             item['supplierName'] = data['supplierName']
-    #             item['inn'] = data['inn']
-    #             item['legalAddress'] = data['legalAddress']
-    #         except KeyError:
-    #             pass
-    #         id_ = str(id_)
-    #         yield scrapy.Request(url=AJAX_REQUEST_PRICE_HISTORY.format(id_[:3], id_[:5], id_), callback=self.parse_history_info,
-    #                              cb_kwargs={'item': item, 'id': id_})
-    #     else:
-    #         yield item
+            if self.wb_parser:
+                if not self.wb_parser.result:
+                    self.wb_parser.result = [dict(item)]
+                else:
+                    self.wb_parser.result.append(dict(item))

-    # def parse_history_info(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     if response.status in SUCCESS_CODES:
-    #         data = response.json()
-    #         item['price_history'] = data
-    #         yield scrapy.Request(url=f'https://product-order-qnt.wildberries.ru/by-nm/?nm={kwargs.get("id")}',
-    #                              callback=self.parse_qnt_info, cb_kwargs={'item': item, 'id': kwargs.get("id")})
-    #     else:
-    #         yield scrapy.Request(url=f'https://product-order-qnt.wildberries.ru/by-nm/?nm={kwargs.get("id")}',
-    #                              callback=self.parse_qnt_info, cb_kwargs={'item': item, 'id': kwargs.get("id")})
-    #
-    # def parse_qnt_info(self, response, **kwargs):
-    #     item = kwargs['item']
-    #     if response.status in SUCCESS_CODES:
-    #         data = response.json()
-    #         qnt = data[0]['qnt']
-    #         item['qty'] = data[0]['qnt']
-    #         yield item
\ No newline at end of file
+
+        yield item
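
A minimal usage sketch of the API this patch introduces, mirroring the calls exercised in tests/test_wbparser.py (the ids and URL below are the ones used in the tests). Because parse_data() starts Twisted's reactor, which cannot be restarted within a process, every identifier and URL should be batched into a single call, as the class docstring advises:

    from wbparser.main import WBparser

    # One WBparser object, one parse_data() call: Scrapy fetches all requests asynchronously.
    parser = WBparser()
    parser.parse_data(
        ids=[11152183, 87628789],
        urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'],
    )

    print(parser.name)    # product names, one entry per parsed item
    print(parser.priceU)  # prices in rubles (the spider divides kopeck prices by 100)

    # Persist the result; file_format is 'json' (default) or 'csv'.
    parser.save_data(file_format='csv', file_name='goods')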