Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Create Validation step #10

Merged
merged 2 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,5 @@ dmypy.json
*.pyc
yourls.sql.gz
/namespaces/
bin/
pyvenv.cfg
40 changes: 40 additions & 0 deletions tests/data/namespaces/iow/links__0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://geoconnex.us/iow/homepage</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/aboutus</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/demo</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/nldi</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/nldi/demo</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/map</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/cr-demo</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/tracker</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/hyriver</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/principles</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/api-mockups</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url><url>
<loc>https://geoconnex.us/iow/guidance</loc>
<lastmod>2023-09-01T05:34:10Z</lastmod>
</url></urlset>
3 changes: 3 additions & 0 deletions tests/data/namespaces/iow/test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id, target, creator, description
https://geoconnexus/iow/homepage, https://internetofwater.org, [email protected], Internet Of Water homepage
https://geoconnex.us//usgs/monitoring-location/([a-zA-Z0-9_]+).*$, https://waterdata.usgs.gov/monitoring-location/$1, [email protected], USGS Monitoring Locations
2,401 changes: 2,401 additions & 0 deletions tests/data/namespaces/ref/hu08/hu08.csv

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion yourls-action/yourls_action/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

import click

from yourls_action.cli import run
from yourls_action.cli import run, validate


@click.group()
Expand All @@ -42,3 +42,4 @@ def cli():


cli.add_command(run)
cli.add_command(validate)
69 changes: 54 additions & 15 deletions yourls-action/yourls_action/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,24 +55,31 @@ def url_join(*parts):

class yourls:
# https://stackoverflow.com/questions/60286623/python-loses-connection-to-mysql-database-after-about-a-day
mysql.connector.connect(
host=os.environ.get('YOURLS_DB_HOST') or 'mysql',
user=os.environ.get('YOURLS_DB_USER') or 'root',
password=os.environ.get('YOURLS_DB_PASSWORD') or 'arootpassword',
database="yourls",
pool_name="yourls_loader",
pool_size=3
)
try:
mysql.connector.connect(
host=os.environ.get('YOURLS_DB_HOST') or 'mysql',
user=os.environ.get('YOURLS_DB_USER') or 'root',
password=os.environ.get('YOURLS_DB_PASSWORD') or 'arootpassword',
database="yourls",
pool_name="yourls_loader",
pool_size=3
)
connection = True
except mysql.connector.errors.DatabaseError as err:
print(f'No SQL connection found: {err}')
connection = False

def __init__(self, **kwargs):
self.kwargs = kwargs
mydb, cursor = connection()
sql_statement = 'DELETE FROM yourls_url WHERE ip = "0.0.0.0"'
cursor.execute(sql_statement)
mydb.commit()
print(cursor.rowcount, "was deleted.")
cursor.close()
mydb.close()
self.history = set()
if self.connection:
mydb, cursor = connection()
sql_statement = 'DELETE FROM yourls_url WHERE ip = "0.0.0.0"'
cursor.execute(sql_statement)
mydb.commit()
print(cursor.rowcount, "was deleted.")
cursor.close()
mydb.close()

def _check_kwargs(self, keys):
"""
Expand Down Expand Up @@ -163,6 +170,38 @@ def handle_csv(self, file):
for chunk in chunky_parsed:
self.post_mysql(file, chunk)

def _validate_csvs(self, files):
"""
Splits list of csv files into individual csv files.
:param files: required, string. URL to be shortened.
"""
for f in files:
self.validate_csv(f)

def validate_csv(self, file):
    """
    Parse a CSV file and check that no identifier occurs more than once.

    The first comma-separated field of every parsed row is treated as the
    identifier. Identifiers are collected in ``self.history``, which is not
    reset here, so a duplicate is detected even when the two occurrences
    live in different files validated by the same instance.

    :param file: required, name of the CSV file to validate, or a list of
        such names (each one is validated in turn).
    :raises SystemExit: with exit code 1 when a row has fewer than two
        fields or when an identifier has already been seen.
    """
    if isinstance(file, list):
        self._validate_csvs(file)
        return

    parsed_csv = self.parse_csv(file)

    # NOTE(review): a chunk size of 1 presumably yields one row of text at
    # a time; each chunk is handled as a single comma-separated line below.
    chunky_parsed = self.chunkify(parsed_csv, 1)
    uri_stem = self.kwargs['uri_stem']
    for row in chunky_parsed:
        fields = row.strip().split(',')
        if len(fields) < 2:
            # Both an id and a target are expected; report the bad row
            # instead of dying with an unpacking ValueError traceback.
            print(f'Malformed row detected in {file}: {row!r}')
            raise SystemExit(1)

        pid_ = fields[0]
        if pid_ in self.history:
            print(f'Duplicate IRI detected at {uri_stem}{pid_}')
            # Raise SystemExit directly: the exit() helper is injected by
            # the site module and is not guaranteed to be available.
            raise SystemExit(1)
        self.history.add(pid_)

def parse_csv(self, filename):
"""
Parse CSV file into yourls-friendly csv.
Expand Down
20 changes: 20 additions & 0 deletions yourls-action/yourls_action/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,25 @@ def run(ctx, **kwargs):
urls.handle_csv(walk_path(p))


@click.command()
@click.pass_context
@click.argument('path', type=str, nargs=-1)
@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/',
              help='uri stem to be removed from short url for keyword')
@click.option('-k', '--keyword', type=str, default='id',
              help='field in CSV to be used as keyword')
@click.option('-l', '--long_url', type=str, default='target',
              help='field in CSV to be used as long url')
@click.option('-t', '--title', type=str, default='description',
              help='field in CSV to be used as title')
def validate(ctx, **kwargs):
    """Validate the CSV files at each PATH, failing on duplicate identifiers."""
    # ctx is injected by click.pass_context; it is not used in this command.
    urls = yourls(**kwargs)
    for p in kwargs['path']:
        # A path ending in the CSV suffix is validated directly; anything
        # else is first expanded by walk_path (presumably into the CSV
        # files found under that location - confirm against walk_path).
        target = p if p.endswith(CSV) else walk_path(p)
        urls.validate_csv(target)


# Script entry point: invoke the `run` command when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    run()
Loading