Skip to content

Commit

Permalink
Merge pull request #2 from maxim-lixakov/feature_1
Browse files Browse the repository at this point in the history
parse and save data, tests
  • Loading branch information
maxim-lixakov authored Sep 28, 2023
2 parents 219eea9 + 78c375d commit 6b45471
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 140 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
8 changes: 3 additions & 5 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,13 @@
sys.path.insert(0, os.path.abspath('..'))

project = 'WBparser'
copyright = '2023, Maxim Liksakov'
copyright = '2023, Maxim Liksakov, Ilyas Gasanov'
author = 'Maxim Liksakov, Ilyas Gasanov'
release = '0.1.0'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = []

templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

Expand All @@ -28,12 +26,12 @@
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']

extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon'
'sphinx.ext.napoleon',
]

master_doc = 'index'
9 changes: 5 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
scrapy
sphinx
twine
pytest
scrapy==2.11.0
sphinx==7.2.6
sphinx_rtd_theme==1.3.0
twine==4.0.2
pytest==7.4.2
101 changes: 77 additions & 24 deletions tests/test_wbparser.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,81 @@
import pytest
from unittest.mock import Mock
import csv
import json
import os
from os import listdir
from os.path import join, isfile

from wbparser.main import WBparser


@pytest.fixture
def mock_parser():
parser = WBparser('11152183')
parser.result = Mock(return_value={'brand': 'O`SHADE',
'colors': [{'id': 0, 'name': 'черный'}],
'diffPrice': False,
'id': '11152183',
'name': 'Ботинки женские натуральная кожа осенние',
'pics': 16,
'priceU': 757400,
'qty': 1012,
'questions': 1940,
'rating': 4.7,
'salePriceU': 560400,
'sizes': ['36', '37', '38', '39', '40', '41', '42', '43']})
return parser


def test_parse_data(mock_parser):
result = mock_parser.parse_data()
assert result is not None
assert 'id', 'name' in result
parser = WBparser()
parser.parse_data(ids=[11152183, 87628789], urls=['https://www.wildberries.ru/catalog/155761175/detail.aspx'])


# Canned parser output used by the save_data tests below.
# Mirrors the shape of a single parsed Wildberries item as produced by
# WBparser.parse_data (prices, sizes, colors, supplier info, etc.).
mock_result = [
    {
        'id': '11152183',
        'name': 'Ботинки женские демисезонные натуральная кожа на шнуровке',
        'brand': 'O`SHADE',
        'priceU': 7574,
        'salePriceU': 5529,
        'picsAmt': 16,
        'colors': [{'name': 'черный', 'id': 0}],
        'sizes': ['36', '37', '38', '39', '40', '41', '42', '43'],
        'qty': 566,
        'rating': 4.7,
        'feedbacksAmt': 2055,
        'supplierId': 18744
    }
]


def test_parse_data_by_ids():
    """Check that the ids passed to parse_data were fetched and exposed.

    NOTE(review): relies on the module-level ``parser`` that performed a live
    crawl at import time — network-dependent and order-dependent; consider
    mocking the crawl or using a fixture. TODO confirm acceptable for CI.
    """
    assert parser.result is not None
    assert 11152183 in parser.ids
    assert 87628789 in parser.ids
    # Brand values observed from the live site at the time the test was written;
    # presumably stable, but they can change upstream — verify periodically.
    assert 'O`SHADE' in parser.brand
    assert 'T.TACCARDI' in parser.brand


def test_parse_data_by_urls():
    """Check that a product parsed via its catalog URL is exposed by id.

    NOTE(review): like the ids test, this depends on the module-level
    ``parser`` and a live crawl — network-dependent; verify in CI.
    """
    assert parser.result is not None
    # '155761175' is the product id embedded in the catalog URL passed above.
    assert '155761175' in parser.ids
    assert 'Avanti' in parser.brand


def test_save_data_to_json():
    """save_data() with the default format must create <name>.json holding the result."""
    mock_parser = WBparser()
    mock_parser.result = mock_result

    mock_parser.save_data(file_name='testfileJSON')

    try:
        assert 'testfileJSON.json' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]

        # json.load reads the file directly; no need to join lines by hand.
        with open('testfileJSON.json', 'r') as f:
            data = json.load(f)[0]

        assert data['id'] == '11152183'
        assert data['rating'] == 4.7
        assert data['sizes'] == ['36', '37', '38', '39', '40', '41', '42', '43']
        assert data['colors'] == [{'name': 'черный', 'id': 0}]
    finally:
        # Remove the artifact so it does not leak into the repo or other tests.
        os.remove('testfileJSON.json')


def test_save_data_to_csv():
    """save_data(file_format='csv') must create <name>.csv with stringified cells."""
    mock_parser = WBparser()
    mock_parser.result = mock_result

    mock_parser.save_data(file_name='testfileCSV', file_format='csv')

    try:
        assert 'testfileCSV.csv' in [f for f in listdir(os.getcwd()) if isfile(join(os.getcwd(), f))]

        with open('testfileCSV.csv', mode='r') as f:
            # Only the first row is asserted on; next() avoids materializing the file.
            data = next(csv.DictReader(f))

        # csv stores every cell as text, so values compare as strings.
        assert data['id'] == '11152183'
        assert data['rating'] == '4.7'
        assert data['sizes'] == "['36', '37', '38', '39', '40', '41', '42', '43']"
    finally:
        # Remove the artifact so it does not leak into the repo or other tests.
        os.remove('testfileCSV.csv')
184 changes: 163 additions & 21 deletions wbparser/main.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,187 @@
import csv
import json
from dataclasses import dataclass
from typing import Union

from scrapy.crawler import CrawlerProcess, CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor

from wbparser.wildberries.wildberries.spiders.goods import GoodsSpider


@dataclass
class WBparser:
"""
A class used to parse data from Wildberries.
"""A class used to parse data from Wildberries.
Pass several identifiers/urls to one WBparser object to make sure that Scrapy will work asynchronously.
Example:
.. code-block:: python
parser = WBparser()
parser.parse_data(ids=[<product_id_1>])
parser.save_data()
Example:
.. code:: python
parser = WBparser(<product_id>)
parser.parse_data()
print(parser.result)
parser = WBparser()
parser.parse_data(urls=[<url_1>, <url_2>, <url_3>,], ids=[<product_id_1>, <product_id_2>,])
parser.save_data(file_format='json', file_name='rareProducts')
print(parser.result)
print(parser.name)
"""

id: int | str
result: dict = None
ids: Union[None, list[int], list[str]] = None
urls: Union[None, list[str]] = None
result: Union[None, list[dict]] = None

def _crawl(self, runner):
deferred = runner.crawl(GoodsSpider, id=self.id, wb_parser=self)
deferred = runner.crawl(GoodsSpider, wb_parser=self)
deferred.addBoth(lambda _: reactor.stop())
return deferred

def parse_data(self) -> dict:
"""
Parses the data for the product id and updates the result attribute.
def parse_data(
self,
urls: Union[None, list[str]] = None,
ids: Union[None, list[int], list[str]] = None
) -> list[dict]:
"""Parses the data for the products identifiers or/and products pages. The data goes to the result attribute.
Returns
-------
dict
The parsed data.
:param urls: Product pages to parse, defaults to None
:type urls: str, optional
:param ids: Products identifiers, defaults to None
:type ids: str, optional
:return: List of dicts, each of which contains info about parsed WB items
:rtype: list[dict]
"""
self.urls = urls
self.ids = ids
runner = CrawlerRunner()
reactor.callLater(0, self._crawl, runner)
reactor.run()
return self.result
return self.result

def save_data(self, file_format='json', file_name='goods', encoding='utf-8') -> None:
    """Creates a file with the selected format and saves the parsing result to it.

    Does nothing when ``self.result`` is empty or ``None`` (nothing parsed yet).

    :param file_format: One of two possible formats ('json' or 'csv'), defaults to 'json'
    :type file_format: str, optional
    :param file_name: Base file name without extension, defaults to 'goods'
    :type file_name: str, optional
    :param encoding: Encoding of the output file, defaults to 'utf-8'
    :type encoding: str, optional
    :raises ValueError: If ``file_name`` contains characters other than letters
        and digits, or ``file_format`` is not a supported format.
    :return: Nothing is returned
    :rtype: None
    """
    if not file_name.isalnum():
        raise ValueError('Only letters and numbers are allowed in the filename')

    # Fail loudly on an unknown format instead of silently writing nothing.
    if file_format not in ('json', 'csv'):
        raise ValueError("file_format must be 'json' or 'csv'")

    if not self.result:
        return

    if file_format == 'json':
        with open(f'{file_name}.json', 'w', encoding=encoding) as f:
            # indent=0 keeps the original one-field-per-line output format.
            json.dump(self.result, f, indent=0)
    else:  # csv
        # Column order is taken from the first parsed item.
        keys = self.result[0].keys()
        with open(f'{file_name}.csv', 'w', newline='', encoding=encoding) as f:
            dict_writer = csv.DictWriter(f, keys)
            dict_writer.writeheader()
            dict_writer.writerows(self.result)

@property
def name(self) -> list[str]:
    """Accesses the product name property.

    :return: Product names of parsed items; empty list if nothing was parsed yet
    :rtype: list[str]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('name') for dct in (self.result or [])]

@property
def brand(self) -> list[str]:
    """Accesses the brand property.

    :return: Brands of parsed items; empty list if nothing was parsed yet
    :rtype: list[str]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('brand') for dct in (self.result or [])]

@property
def priceU(self) -> list[int]:
    """Accesses the product price property.

    :return: Prices of parsed items; empty list if nothing was parsed yet
    :rtype: list[int]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('priceU') for dct in (self.result or [])]

@property
def salePriceU(self) -> list[int]:
    """Accesses the sale price property.

    :return: Sale prices of parsed items; empty list if nothing was parsed yet
    :rtype: list[int]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('salePriceU') for dct in (self.result or [])]

@property
def picsAmt(self) -> list[int]:
    """Accesses the pictures amount property.

    :return: Picture amounts of parsed items; empty list if nothing was parsed yet
    :rtype: list[int]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('picsAmt') for dct in (self.result or [])]

@property
def colors(self) -> list[list[dict]]:
    """Accesses the color property.

    :return: Colors of parsed items; empty list if nothing was parsed yet
    :rtype: list[list[dict]]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('colors') for dct in (self.result or [])]

@property
def sizes(self) -> list[list[str]]:
    """Accesses the size property.

    :return: Size values of parsed items; empty list if nothing was parsed yet
    :rtype: list[list[str]]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('sizes') for dct in (self.result or [])]

@property
def qty(self) -> list[int]:
    """Accesses the qty property.

    :return: Amount of available products for each parsed item; empty list if
        nothing was parsed yet
    :rtype: list[int]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('qty') for dct in (self.result or [])]

@property
def supplierId(self) -> list[str]:
    """Accesses the supplier identifier property.

    :return: Supplier identifiers of parsed items; empty list if nothing was parsed yet
    :rtype: list[str]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('supplierId') for dct in (self.result or [])]

@property
def rating(self) -> list[float]:
    """Accesses the rating property.

    :return: Rating values of parsed items; empty list if nothing was parsed yet
    :rtype: list[float]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('rating') for dct in (self.result or [])]

@property
def feedbacksAmt(self) -> list[int]:
    """Accesses the number of feedbacks property.

    :return: Number of feedbacks of parsed items; empty list if nothing was parsed yet
    :rtype: list[int]
    """
    # `result` is None until parse_data() runs; avoid iterating over None.
    return [dct.get('feedbacksAmt') for dct in (self.result or [])]
16 changes: 3 additions & 13 deletions wbparser/wildberries/wildberries/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,14 @@

class WildberriesItem(scrapy.Item):
id = scrapy.Field()
inn = scrapy.Field()
supplierId = scrapy.Field()
supplierName = scrapy.Field()
legalAddress = scrapy.Field()
name = scrapy.Field()
brand = scrapy.Field()
priceU = scrapy.Field()
sale = scrapy.Field()
salePriceU = scrapy.Field()
pics = scrapy.Field()
picsAmt = scrapy.Field()
colors = scrapy.Field()
sizes = scrapy.Field()
qty = scrapy.Field()
diffPrice = scrapy.Field()
price_history = scrapy.Field()
supplierId = scrapy.Field()
rating = scrapy.Field()
comments = scrapy.Field()
sold = scrapy.Field()
description = scrapy.Field()
questions = scrapy.Field()

feedbacksAmt = scrapy.Field()
Loading

0 comments on commit 6b45471

Please sign in to comment.