Commit

Merge branch 'main' of github.com:open-contracting/kingfisher-collect
yolile committed Sep 6, 2024
2 parents f58645c + 5f2735d commit 7d61e2e
Showing 21 changed files with 182 additions and 165 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/ci.yml
@@ -11,20 +11,18 @@ jobs:
python-version: '3.10'
cache: pip
cache-dependency-path: '**/requirements*.txt'
# Don't install editable projects in the current working directory.
# https://pip.pypa.io/en/latest/reference/pip_install/#install-src
- run: pip install --src $GITHUB_WORKSPACE/../src -r requirements_dev.txt
- run: pip install -r requirements_dev.txt
- env:
KINGFISHER_COLLECT_DATABASE_URL: postgresql://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres
# Use 127.0.0.1 to avoid log messages about IPv6.
RABBIT_URL: amqp://127.0.0.1:${{ job.services.rabbitmq.ports[5672] }}
KINGFISHER_API2_TEST_URL: http://localhost:${{ job.services.httpbin.ports[8080] }}/anything/
# For requests.post() in KingfisherProcessAPI2._post_synchronous().
run: pytest -W error -W ignore::ResourceWarning -rs --cov kingfisher_scrapy
# https://github.com/pytest-dev/pytest-twisted/issues/183
# https://github.com/scrapy/scrapy/issues/6450
run: pytest --cov-report=lcov:coverage/lcov.info -W error -W ignore::ResourceWarning -W ignore::DeprecationWarning:pytest_twisted -W ignore::DeprecationWarning:scrapy.core.downloader.webclient -rs --cov kingfisher_scrapy
- run: python test_delayed_request_middleware.py
- env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: coveralls --service=github
- uses: coverallsapp/github-action@v2
services:
postgres:
image: postgres:15
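Note: the new pytest invocation keeps `-W error` but carves out the two known deprecation warnings linked above. The `-W action::category:module` flags correspond roughly to `warnings.filterwarnings` calls — a sketch, since `-W` matches module fields slightly differently in detail:

```python
import warnings

warnings.filterwarnings('error')  # -W error: any other warning fails the run
warnings.filterwarnings('ignore', category=ResourceWarning)  # -W ignore::ResourceWarning
# -W ignore::DeprecationWarning:pytest_twisted (pytest-twisted#183)
warnings.filterwarnings('ignore', category=DeprecationWarning, module='pytest_twisted')
# -W ignore::DeprecationWarning:scrapy.core.downloader.webclient (scrapy#6450)
warnings.filterwarnings('ignore', category=DeprecationWarning, module='scrapy.core.downloader.webclient')
```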
20 changes: 16 additions & 4 deletions .github/workflows/lint.yml
@@ -6,22 +6,34 @@ jobs:
build:
if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
runs-on: ubuntu-latest
env:
PAT: ${{ secrets.PAT }}
steps:
- uses: actions/checkout@v4
with:
token: ${{ secrets.PAT || github.token }}
- uses: actions/setup-python@v5
with:
python-version: '3.10'
cache: pip
cache-dependency-path: '**/requirements*.txt'
- run: shasum -c requirements.txt.sha256
- id: changed-files
uses: tj-actions/changed-files@v45
- uses: pre-commit/action@v3.0.1
continue-on-error: true
with:
extra_args: pip-compile --files ${{ steps.changed-files.outputs.all_changed_files }}
- if: ${{ env.PAT }}
uses: stefanzweifel/git-auto-commit-action@v5
with:
commit_message: '[github-actions] pre-commit autoupdate'
- shell: bash
run: curl -s -S --retry 3 $BASEDIR/tests/install.sh | bash -
- shell: bash
run: curl -s -S --retry 3 $BASEDIR/tests/script.sh | bash -
# Don't install editable projects in the current working directory.
# https://pip.pypa.io/en/latest/reference/pip_install/#install-src
- run: pip install --src $GITHUB_WORKSPACE/../src -r requirements_dev.txt
- run: pip install -r requirements_dev.txt
- env:
# scrapyd is run as a command in production. scrapyd-client is run as a command for deployment.
STANDARD_MAINTENANCE_SCRIPTS_IGNORE: scrapyd,scrapyd-client
run: pytest /tmp/test_requirements.py
- run: shasum -c requirements.txt.sha256
7 changes: 3 additions & 4 deletions .github/workflows/nonlinux.yml
@@ -17,10 +17,9 @@ jobs:
- name: Install postgresql (macOS)
if: matrix.os == 'macos-latest'
run: brew install postgresql
# Don't install editable projects in the current working directory.
# https://pip.pypa.io/en/latest/reference/pip_install/#install-src
- run: pip install --src $GITHUB_WORKSPACE/../src -r requirements_dev.txt
- run: pip install -r requirements_dev.txt
- env:
CI_SKIP: true
run: pytest -W error -rs --cov kingfisher_scrapy
# https://github.com/pytest-dev/pytest-twisted/issues/183
run: pytest -W error -W ignore::DeprecationWarning:pytest_twisted -rs --cov kingfisher_scrapy
- run: python test_delayed_request_middleware.py
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,23 @@
ci:
autoupdate_schedule: quarterly
skip: [pip-compile]
repos:
- repo: https://github.com/pycqa/flake8
rev: 7.1.0
hooks:
- id: flake8
additional_dependencies: [flake8-comprehensions]
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.4.4
hooks:
- id: pip-compile
name: pip-compile requirements.in
args: [requirements.in, -o, requirements.txt]
- id: pip-compile
name: pip-compile requirements_dev.in
args: [requirements_dev.in, -o, requirements_dev.txt]
files: ^requirements(_dev)?\.(in|txt)$
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.10
3 changes: 3 additions & 0 deletions kingfisher_scrapy/base_spiders/compressed_file_spider.py
@@ -79,6 +79,9 @@ def parse(self, response):
if self.sample and number > self.sample:
break

if not file_info.file_size:
continue

filename = file_info.filename
basename = os.path.basename(filename)
if (
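Note: the added `file_size` check skips zero-byte archive entries before the filename checks run. A minimal sketch of the guard, using `zipfile` for illustration (the spider also reads other archive formats):

```python
import io
import zipfile

buffer = io.BytesIO()
with zipfile.ZipFile(buffer, 'w') as archive:
    archive.writestr('empty.json', '')         # zero bytes
    archive.writestr('data.json', '{"a": 1}')  # non-empty

with zipfile.ZipFile(buffer) as archive:
    for file_info in archive.infolist():
        if not file_info.file_size:  # the new guard
            continue
        print(file_info.filename)  # prints only data.json
```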
2 changes: 1 addition & 1 deletion kingfisher_scrapy/extensions/files_store.py
@@ -59,7 +59,7 @@ def spider_closed(self, spider, reason):
message_length = math.ceil(len(message) / 2) * 2
title_length = message_length // 2 - 8

spider.logger.info(f"+-{'-' * title_length } DATA DIRECTORY {'-' * title_length }-+")
spider.logger.info(f"+-{'-' * title_length} DATA DIRECTORY {'-' * title_length}-+")
spider.logger.info(f"| {' ' * message_length} |")
spider.logger.info(f"| {message.ljust(message_length)} |")
spider.logger.info(f"| {' ' * message_length} |")
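Note: this change only removes stray spaces inside the f-string replacement fields; the output is identical. The banner arithmetic works because every line comes out `message_length + 4` characters wide — a sketch with a made-up message:

```python
import math

message = 'path/to/data/directory'  # hypothetical
message_length = math.ceil(len(message) / 2) * 2  # round up to an even width
title_length = message_length // 2 - 8  # dashes on each side of ' DATA DIRECTORY '

# Both lines are message_length + 4 characters wide.
print(f"+-{'-' * title_length} DATA DIRECTORY {'-' * title_length}-+")
print(f"| {message.ljust(message_length)} |")
```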
2 changes: 1 addition & 1 deletion kingfisher_scrapy/items.py
@@ -47,7 +47,7 @@ class DataResource(Resource, arbitrary_types_allowed=True, use_enum_values=True)
@pydantic.validator('data', pre=True) # `pre` is needed to prevent pydantic from type casting
def check_data(cls, v):
# pydantic has no `condict()` to set `strict=True` or `min_properties=1`. pydantic/pydantic#1277
assert isinstance(v, (Data, bytes)), f'{v.__class__.__name__} is not a valid type'
assert isinstance(v, (Data, bytes)), f'{type(v).__name__} is not a valid type'
assert v, 'ensure this value is non-empty'
return v

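Note: `type(v).__name__` and `v.__class__.__name__` agree for ordinary objects, but `type()` reports the real type even when an instance overrides `__class__` — a small illustration, not from the codebase:

```python
class Liar:
    @property
    def __class__(self):  # instances can fake __class__
        return dict

v = Liar()
print(v.__class__.__name__)  # 'dict'  (follows the faked attribute)
print(type(v).__name__)      # 'Liar'  (always the actual type)
```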
5 changes: 3 additions & 2 deletions kingfisher_scrapy/pipelines.py
@@ -9,6 +9,7 @@
import ijson
import jsonpointer
from flattentool import unflatten
from flattentool.exceptions import FlattenToolWarning
from scrapy.exceptions import DropItem, NotSupported

from kingfisher_scrapy.items import File, FileItem, PluckedItem
@@ -56,7 +57,7 @@ def process_item(self, item, spider):

# Drop FileError items, so that we keep trying to get data.
if not isinstance(item, (File, FileItem)):
raise DropItem(f'Sample: Item is a {item.__class__.__name__}, not a File or FileItem')
raise DropItem(f'Sample: Item is a {type(item).__name__}, not a File or FileItem')
if self.item_count >= spider.sample:
spider.crawler.engine.close_spider(spider, 'sample')
raise DropItem('Sample: Maximum sample size reached')
@@ -172,7 +173,7 @@ def process_item(self, item, spider):
f.write(pkgutil.get_data('kingfisher_scrapy', f'schema/{spider.ocds_version}.json'))

with warnings.catch_warnings():
warnings.filterwarnings('ignore') # flattentool uses UserWarning, so we can't set a specific category
warnings.filterwarnings('ignore', category=FlattenToolWarning)

unflatten(
input_name,
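Note: the blanket `warnings.filterwarnings('ignore')` is narrowed to one category, which the flattentool upgrade below (0.24.0 → 0.26.0 in requirements.txt) enables by providing `FlattenToolWarning`. A sketch of the new behaviour:

```python
import warnings

from flattentool.exceptions import FlattenToolWarning

with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=FlattenToolWarning)
    warnings.warn('spreadsheet quirk', FlattenToolWarning)  # silenced
    warnings.warn('unrelated problem')  # still emitted as a UserWarning
# The previous filters are restored outside the block.
```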
22 changes: 22 additions & 0 deletions kingfisher_scrapy/spiders/chile_compra_api_base.py
@@ -1,5 +1,6 @@
from abc import abstractmethod
from datetime import date
from json import JSONDecodeError

from kingfisher_scrapy.base_spiders import IndexSpider, PeriodicSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
@@ -82,6 +83,27 @@ def parse_page(self, response):
def handle_item(self, item):
pass

# from IndexSpider
def parse_list_loader(self, response):
try:
data = response.json()
except JSONDecodeError:
yield self.build_file_error_from_response(
response, errors={'http_code': response.status, 'text': response.text}
)
return

# Some files contain invalid packages, e.g.:
# {
# "detail": "Error en la generación. ",
# "status": 500
# }
if set(data) == {'detail', 'status'}:
data['http_code'] = data['status']
return self.build_file_error_from_response(response, errors=data)

return data

# from IndexSpider
def url_builder(self, value, data, response):
# URL looks like http://api.mercadopublico.cl/APISOCDS/OCDS/listaOCDSAgnoMesTratoDirecto/2021/03/31500/100
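Note: `set(data) == {'detail', 'status'}` compares the payload's keys exactly, so normal listings with other keys fall through to the final `return data`. Illustrative payloads (the success shape is hypothetical):

```python
error = {'detail': 'Error en la generación. ', 'status': 500}
print(set(error) == {'detail', 'status'})  # True: becomes a file error

listing = {'data': [], 'total': 0, 'status': 200}  # hypothetical success payload
print(set(listing) == {'detail', 'status'})  # False: returned as parsed data
```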
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/mexico_nuevo_leon_records.py
@@ -8,7 +8,7 @@ class MexicoNuevoLeonRecords(SimpleSpider):
Domain
Secretaría de Movilidad y Planeación Urbana de Nuevo León
Bulk download documentation
http://si.nl.gob.mx/transparencia/publicaciones
https://smpu.nl.gob.mx/transparencia/publicaciones
"""
name = 'mexico_nuevo_leon_records'

@@ -17,6 +17,6 @@ class MexicoNuevoLeonRecords(SimpleSpider):

def start_requests(self):
yield scrapy.Request(
'http://si.nl.gob.mx/siasi_ws/api/ocds/DescargarRecordPackage',
'https://smpu.nl.gob.mx/siasi_ws/api/ocds/DescargarRecordPackage',
meta={'file_name': 'records.json'}
)
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/mexico_nuevo_leon_releases.py
@@ -8,7 +8,7 @@ class MexicoNuevoLeonReleases(CompressedFileSpider):
Domain
Secretaría de Movilidad y Planeación Urbana de Nuevo León
Bulk download documentation
http://si.nl.gob.mx/transparencia/acerca-del-proyecto
https://smpu.nl.gob.mx/transparencia/acerca-del-proyecto
"""
name = 'mexico_nuevo_leon_releases'

@@ -22,5 +22,5 @@ class MexicoNuevoLeonReleases(CompressedFileSpider):
file_name_must_contain = 'ReleasePackage'

def start_requests(self):
url = 'http://si.nl.gob.mx/acceso/DatosAbiertos/JSONsInfraestructuraAbierta.rar'
url = 'https://smpu.nl.gob.mx/acceso/DatosAbiertos/JSONsInfraestructuraAbierta.rar'
yield scrapy.Request(url, meta={'file_name': 'all.rar'})
23 changes: 20 additions & 3 deletions pyproject.toml
@@ -1,6 +1,23 @@
[project]
name = "kingfisher-collect"
version = "0.0.0"

[project.entry-points.scrapy]
settings = "kingfisher_scrapy.settings"

[tool.setuptools.packages.find]
exclude = ["tests", "tests.*"]

[tool.setuptools.package-data]
kingfisher_scrapy = ["schema/*.json"]

[tool.isort]
profile = "black"
line_length = 119

[tool.pytest.ini_options]
addopts = '--doctest-modules'
asyncio_mode = 'auto'
addopts = "--doctest-modules"
asyncio_mode = "auto"

[tool.coverage.run]
omit = ['*/kingfisher_scrapy/spiders/*']
omit = ["*/kingfisher_scrapy/spiders/*"]
36 changes: 19 additions & 17 deletions requirements.txt
@@ -1,9 +1,5 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile
#
# This file was autogenerated by uv via the following command:
# uv pip compile requirements.in -o requirements.txt
attrs==22.2.0
# via
# automat
Expand Down Expand Up @@ -46,7 +42,7 @@ et-xmlfile==1.0.1
# via openpyxl
filelock==3.4.1
# via tldextract
flattentool==0.24.0
flattentool==0.26.0
# via -r requirements.in
hyperlink==21.0.0
# via twisted
@@ -60,7 +56,7 @@ ijson==3.2.3
# -r requirements.in
# flattentool
# ocdskit
incremental==22.10.0
incremental==24.7.2
# via twisted
itemadapter==0.7.0
# via
@@ -84,14 +80,14 @@ jsonref==1.0.1
# ocdsextensionregistry
# ocdskit
# ocdsmerge
lxml==4.9.2
lxml==5.3.0
# via
# flattentool
# parsel
# scrapy
ocdsextensionregistry==0.2.2
# via ocdskit
ocdskit[perf]==1.1.13
ocdskit==1.1.13
# via -r requirements.in
ocdsmerge==0.7.0
# via ocdskit
@@ -161,14 +157,23 @@ scrapy==2.11.2
# -r requirements.in
# scrapyd
# scrapyd-client
scrapyd==1.4.3
scrapyd==1.5.0b1
# via -r requirements.in
scrapyd-client==1.2.3
# via -r requirements.in
sentry-sdk==1.19.0
sentry-sdk==2.10.0
# via -r requirements.in
service-identity==24.1.0
# via scrapy
setuptools==74.1.1
# via
# incremental
# scrapy
# scrapyd
# zc-lockfile
# zc-zlibstorage
# zodbpickle
# zope-interface
six==1.16.0
# via
# automat
@@ -181,7 +186,7 @@ tldextract==3.1.2
# via scrapy
transaction==3.1.0
# via zodb
twisted==24.3.0
twisted==24.7.0rc1
# via
# -r requirements.in
# scrapy
Expand Down Expand Up @@ -209,7 +214,7 @@ w3lib==2.1.1
# scrapyd-client
xmltodict==0.12.0
# via flattentool
yapw[perf]==0.1.4
yapw==0.1.4
# via -r requirements.in
zc-lockfile==3.0.post1
# via zodb
@@ -233,6 +238,3 @@ zope-interface==6.0
# twisted
# zc-zlibstorage
# zodb

# The following packages are considered to be unsafe in a requirements file:
# setuptools
2 changes: 1 addition & 1 deletion requirements.txt.sha256
@@ -1 +1 @@
7c48c01584024f7ea7bd8b3ad63b6c15f64f76dddb0d271007f45b451af22cc2 requirements.txt
3f0f25b383baca3102034710cdf6abdc96d1efd027f9e83ede7781287d057d3f requirements.txt
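Note: the updated digest pins the regenerated requirements.txt. The lint workflow's `shasum -c requirements.txt.sha256` step asserts, in Python terms:

```python
import hashlib

expected = '3f0f25b383baca3102034710cdf6abdc96d1efd027f9e83ede7781287d057d3f'
with open('requirements.txt', 'rb') as f:
    assert hashlib.sha256(f.read()).hexdigest() == expected, 'requirements.txt changed'
```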
4 changes: 0 additions & 4 deletions requirements_dev.in
@@ -1,11 +1,7 @@
-r requirements.txt
coverage[toml]
coveralls
flake8
isort
ocdsmerge
openpyxl
pip-tools
pika
psycopg2-binary
pytest