-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from maxim-lixakov/feature_1
parse and save data, tests
- Loading branch information
Showing
7 changed files
with
277 additions
and
140 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
scrapy | ||
sphinx | ||
twine | ||
pytest | ||
scrapy==2.11.0 | ||
sphinx==7.2.6 | ||
sphinx_rtd_theme==1.3.0 | ||
twine==4.0.2 | ||
pytest==7.4.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,81 @@ | ||
import pytest | ||
from unittest.mock import Mock | ||
import csv | ||
import json | ||
import os | ||
from os import listdir | ||
from os.path import join, isfile | ||
|
||
from wbparser.main import WBparser | ||
|
||
|
||
@pytest.fixture | ||
def mock_parser(): | ||
parser = WBparser('11152183') | ||
parser.result = Mock(return_value={'brand': 'O`SHADE', | ||
'colors': [{'id': 0, 'name': 'черный'}], | ||
'diffPrice': False, | ||
'id': '11152183', | ||
'name': 'Ботинки женские натуральная кожа осенние', | ||
'pics': 16, | ||
'priceU': 757400, | ||
'qty': 1012, | ||
'questions': 1940, | ||
'rating': 4.7, | ||
'salePriceU': 560400, | ||
'sizes': ['36', '37', '38', '39', '40', '41', '42', '43']}) | ||
return parser | ||
|
||
|
||
def test_parse_data(mock_parser): | ||
result = mock_parser.parse_data() | ||
assert result is not None | ||
assert 'id', 'name' in result | ||
parser = WBparser() | ||
parser.parse_data(ids=[11152183, 87628789], urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx']) | ||
|
||
|
||
# Canned item used by the save_data tests in place of a live crawl.
_SAMPLE_ITEM = {
    'id': '11152183',
    'name': 'Ботинки женские демисезонные натуральная кожа на шнуровке',
    'brand': 'O`SHADE',
    'priceU': 7574,
    'salePriceU': 5529,
    'picsAmt': 16,
    'colors': [{'name': 'черный', 'id': 0}],
    'sizes': ['36', '37', '38', '39', '40', '41', '42', '43'],
    'qty': 566,
    'rating': 4.7,
    'feedbacksAmt': 2055,
    'supplierId': 18744,
}

# Same shape as WBparser.result: a list with one dict per item.
mock_result = [_SAMPLE_ITEM]
|
||
|
||
def test_parse_data_by_ids():
    """The module-level parser holds the requested ids and the expected brands."""
    assert parser.result is not None

    for product_id in (11152183, 87628789):
        assert product_id in parser.ids

    for brand_name in ('O`SHADE', 'T.TACCARDI'):
        assert brand_name in parser.brand
|
||
|
||
def test_parse_data_by_urls():
    """The item requested by URL shows up in the module-level parser's ids and brands."""
    expected_id, expected_brand = '155761175', 'Avanti'

    assert parser.result is not None
    assert expected_id in parser.ids
    assert expected_brand in parser.brand
|
||
|
||
def test_save_data_to_json(tmp_path, monkeypatch):
    """save_data() writes a JSON file whose first item round-trips the sample fields."""
    # Work inside pytest's per-test temporary directory so the output file is
    # not left behind in the real working directory after the run.
    monkeypatch.chdir(tmp_path)

    mock_parser = WBparser()
    mock_parser.result = mock_result

    mock_parser.save_data(file_name='testfileJSON')

    saved_file = tmp_path / 'testfileJSON.json'
    assert saved_file.is_file()

    # save_data() writes with utf-8 by default; read back with the same encoding
    # instead of relying on the platform default.
    with open(saved_file, 'r', encoding='utf-8') as f:
        data = json.load(f)[0]

    assert data['id'] == '11152183'
    assert data['rating'] == 4.7
    assert data['sizes'] == ['36', '37', '38', '39', '40', '41', '42', '43']
    assert data['colors'] == [{'name': 'черный', 'id': 0}]
|
||
|
||
def test_save_data_to_csv(tmp_path, monkeypatch):
    """save_data(file_format='csv') writes a CSV whose first row holds the sample fields as text."""
    # Work inside pytest's per-test temporary directory so the output file is
    # not left behind in the real working directory after the run.
    monkeypatch.chdir(tmp_path)

    mock_parser = WBparser()
    mock_parser.result = mock_result

    mock_parser.save_data(file_name='testfileCSV', file_format='csv')

    saved_file = tmp_path / 'testfileCSV.csv'
    assert saved_file.is_file()

    # newline='' is what the csv module expects for files it reads; the
    # encoding matches save_data()'s utf-8 default.
    with open(saved_file, mode='r', newline='', encoding='utf-8') as f:
        data = next(csv.DictReader(f))

    # CSV stores everything as text, so numbers and lists come back as strings.
    assert data['id'] == '11152183'
    assert data['rating'] == '4.7'
    assert data['sizes'] == "['36', '37', '38', '39', '40', '41', '42', '43']"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,45 +1,187 @@ | ||
import csv | ||
import json | ||
from dataclasses import dataclass | ||
from typing import Union | ||
|
||
from scrapy.crawler import CrawlerProcess, CrawlerRunner | ||
from twisted.internet import reactor, defer | ||
from scrapy.crawler import CrawlerRunner | ||
from twisted.internet import reactor | ||
|
||
from wbparser.wildberries.wildberries.spiders.goods import GoodsSpider | ||
|
||
|
||
@dataclass | ||
class WBparser: | ||
""" | ||
A class used to parse data from Wildberries. | ||
"""A class used to parse data from Wildberries. | ||
Pass several identifiers/urls to one WBparser object to make sure that Scrapy will work asynchronously. | ||
Example: | ||
.. code-block:: python | ||
parser = WBparser() | ||
parser.parse_data(ids=[<product_id_1>]) | ||
parser.save_data() | ||
Example: | ||
.. code:: python | ||
parser = WBparser(<product_id>) | ||
parser.parse_data() | ||
print(parser.result) | ||
parser = WBparser() | ||
parser.parse_data(urls=[<url_1>, <url_2>, <url_3>,], ids=[<product_id_1>, <product_id_2>,]) | ||
parser.save_data(file_format='json', file_name='rareProducts') | ||
print(parser.result) | ||
print(parser.name) | ||
""" | ||
|
||
id: int | str | ||
result: dict = None | ||
ids: Union[None, list[int], list[str]] = None | ||
urls: Union[None, list[str]] = None | ||
result: Union[None, list[dict]] = None | ||
|
||
def _crawl(self, runner): | ||
deferred = runner.crawl(GoodsSpider, id=self.id, wb_parser=self) | ||
deferred = runner.crawl(GoodsSpider, wb_parser=self) | ||
deferred.addBoth(lambda _: reactor.stop()) | ||
return deferred | ||
|
||
def parse_data(self) -> dict: | ||
""" | ||
Parses the data for the product id and updates the result attribute. | ||
def parse_data( | ||
self, | ||
urls: Union[None, list[str]] = None, | ||
ids: Union[None, list[int], list[str]] = None | ||
) -> list[dict]: | ||
"""Parses the data for the products identifiers or/and products pages. The data goes to the result attribute. | ||
Returns | ||
------- | ||
dict | ||
The parsed data. | ||
:param urls: Product pages to parse, defaults to None | ||
:type urls: list[str], optional | ||
:param ids: Products identifiers, defaults to None | ||
:type ids: list[int] or list[str], optional | ||
:return: List of dicts, each of which contains info about parsed WB items | ||
:rtype: list[dict] | ||
""" | ||
self.urls = urls | ||
self.ids = ids | ||
runner = CrawlerRunner() | ||
reactor.callLater(0, self._crawl, runner) | ||
reactor.run() | ||
return self.result | ||
return self.result | ||
|
||
def save_data(self, file_format='json', file_name='goods', encoding='utf-8') -> None:
    """Creates a file with the selected format and saves the parsing result to it.

    The file is written to the current working directory as
    ``<file_name>.<file_format>``. Nothing is written when the result is
    empty or ``None``.

    :param file_format: One of two possible formats (json or csv), defaults to json
    :type file_format: str, optional
    :param file_name: File name without extension, letters and digits only, defaults to goods
    :type file_name: str, optional
    :param encoding: Encoding type, defaults to utf-8
    :type encoding: str, optional
    :raises ValueError: If the file name is not alphanumeric or the format is not json/csv
    :return: Nothing is returned
    :rtype: None
    """
    if not file_name.isalnum():
        raise ValueError('Only letters and numbers are allowed in the filename')

    # Reject unknown formats loudly instead of silently writing nothing.
    if file_format not in ('json', 'csv'):
        raise ValueError(f"Unsupported file format: {file_format!r} (expected 'json' or 'csv')")

    if not self.result:
        return

    if file_format == 'json':
        with open(f'{file_name}.json', 'w', encoding=encoding) as f:
            f.write(json.dumps(self.result, indent=0))
        return

    # Build the header from every row (first-seen order) so an item carrying
    # a key the first item lacks does not make DictWriter raise; missing
    # values are written as empty cells.
    fieldnames = list(dict.fromkeys(key for row in self.result for key in row))
    with open(f'{file_name}.csv', 'w', newline='', encoding=encoding) as f:
        dict_writer = csv.DictWriter(f, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(self.result)
|
||
@property
def name(self) -> list[str]:
    """Accesses the product name property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Product names of parsed items
    :rtype: list[str]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('name') for dct in self.result or []]
|
||
@property
def brand(self) -> list[str]:
    """Accesses the brand property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Brands of parsed items
    :rtype: list[str]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('brand') for dct in self.result or []]
|
||
@property
def priceU(self) -> list[int]:
    """Accesses the product price property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Prices of parsed items
    :rtype: list[int]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('priceU') for dct in self.result or []]
|
||
@property
def salePriceU(self) -> list[int]:
    """Accesses the sale price property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Sale prices of parsed items
    :rtype: list[int]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('salePriceU') for dct in self.result or []]
|
||
@property
def picsAmt(self) -> list[int]:
    """Accesses the pictures amount property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Picture amounts of parsed items
    :rtype: list[int]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('picsAmt') for dct in self.result or []]
|
||
@property
def colors(self) -> list[list[dict]]:
    """Accesses the color property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Colors of parsed items
    :rtype: list[list[dict]]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('colors') for dct in self.result or []]
|
||
@property
def sizes(self) -> list[list[str]]:
    """Accesses the size property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Size values of parsed items
    :rtype: list[list[str]]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('sizes') for dct in self.result or []]
|
||
@property
def qty(self) -> list[int]:
    """Accesses the qty property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Amount of available products for each parsed item
    :rtype: list[int]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('qty') for dct in self.result or []]
|
||
@property
def supplierId(self) -> list[str]:
    """Accesses the supplier identifier property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Supplier identifiers of parsed items
    :rtype: list[str]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('supplierId') for dct in self.result or []]
|
||
@property
def rating(self) -> list[float]:
    """Accesses the rating property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Rating values of parsed items
    :rtype: list[float]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('rating') for dct in self.result or []]
|
||
@property
def feedbacksAmt(self) -> list[int]:
    """Accesses the number of feedbacks property.

    Returns an empty list while ``result`` is still ``None``.

    :return: Number of feedbacks of parsed items
    :rtype: list[int]
    """
    # ``result`` defaults to None; iterate an empty list instead of raising TypeError.
    return [dct.get('feedbacksAmt') for dct in self.result or []]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.