From c11e810c6491fdf176ff03a174bea49f4ccbbdf5 Mon Sep 17 00:00:00 2001
From: Steven Winship <39765413+stevenwinship@users.noreply.github.com>
Date: Fri, 5 Apr 2024 15:39:51 -0400
Subject: [PATCH] fix secret config load and add clean-up for re-run (#1)

* fix secret config load and add clean-up for re-run

* added delete old data from datacite and updated readme

* fix readme
---
 README.md        | 11 ++++++++---
 config/config.py | 27 ++++++++++++++++++++++++---
 main.py          |  4 ++++
 upload/upload.py | 16 ++++++++++++++--
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 1f826cb..39c0f28 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ python installed, you can use `pip` and `python` as shown in the examples.
 The software assumes you area already logging your COUNTER dataset *investigations* and *requests* to a log file
 using a format somewhat similar to extended log format. The COUNTER Code of Practice requires that descriptive
 metadata be submitted along with statistics--these items are included in logs to ease later processing.
-Log items are separated by tabs (\t) and any missing values may be logged with a dash (-) or by a an empty string.
+Log items are separated by tabs (\t) and any missing values may be logged with a dash (-) or by an empty string.
 
 ## Go through an example of installing and running the script
 
@@ -44,7 +44,7 @@ with some examples.
 - Publication Date (ISO8601 format)
 - Version
 - Other/Alternate ID
-- Target URL that the itentifer such as the DOi would resolve to
+- Target URL that the identifier (such as the DOI) would resolve to
 - Year of Publication
 
 ## Overview of processing logs
 
@@ -98,6 +98,7 @@ If you don't set a CONFIG_FILE the script will use the one at *config/config.yam
 - **simulate_date**: put in a yyyy-mm-dd date to simulate running a report on that specified year month and day. Normally the script will process logs and create data output through the previous day based on the system time. A report run for a month after a reporting period is over will process things up to the end of that reporting month as specified by year_month. Setting this allows simulating a run on a different day and is mostly for testing. See information about how state is maintained in the section below to understand what happens when specifying a different date. The processor expects an orderly processing of logs in chronological order such as running nightly or weekly.
 - **maxmind\_geoip\_country\_path**: set the path to the GeoLite2-Country.mmdb binary geolocation database file. You may need to periodically download updates to this file from MaxMind.
 - **output\_volume**: set to True if you'd like volume (file size) information output in the report. This option is currently not supported when submitting reports to the hub.
+- **clean\_for\_rerun**: set to True to force the deletion of existing data for the month being processed (the report on the hub, the month's entry in the state file, and its sqlite database). This option should not be included in the config file but can be passed in when re-running an already processed set of log files.
 
 ## Maintaining State Between Runs
 
@@ -113,7 +114,7 @@ There is also an id key for each month which indicates the identifier returned b
 
 The state allows data to be added to the database from the logs, for example each night, without reprocessing every log for the month every night.
 
-For example, if the script is run on May 2nd, and for a May 2018 report, it woould process the log file for May 1st and put entries in the 2018-05 database for that log file (from which stats can be calculated). 
+For example, if the script is run on May 2nd, and for a May 2018 report, it would process the log file for May 1st and put entries in the 2018-05 database for that log file (from which stats can be calculated).
 
 If run again on May 3rd, it would only need to process the May 2nd log into the database because May 1st has already been processed.
 
@@ -130,6 +131,10 @@ If you wish to completely reprocess and submit a month's data from log files you
 3. Remove the appropriate month's sqlite database from the file system
 4. Reprocess the month. If it's after the month, use *year_month* for the months report you'd like.
 
+These four steps can be performed automatically by passing *clean_for_rerun=True*. Note: if a report id exists in the state file, the DELETE request will be sent regardless of the *upload_to_hub* flag.
+
+```CLEAN_FOR_RERUN=True ./main.py```
+
 It might also be important to understand how state works if moving the script to a different system so that you maintain the state files as needed.
 
diff --git a/config/config.py b/config/config.py
index 51d449d..c71666e 100644
--- a/config/config.py
+++ b/config/config.py
@@ -1,6 +1,8 @@
 import yaml
 import os
 from models import *
+from upload import upload
+import logging
 import input_processor as ip
 import output_processor as op
 import sys
@@ -19,11 +21,13 @@ class _Config:
     ALLOWED_ENV = ('LOG_NAME_PATTERN', 'ROBOTS_URL', 'MACHINES_URL', 'YEAR_MONTH',
                    'OUTPUT_FILE', 'PLATFORM', 'HUB_API_TOKEN', 'HUB_BASE_URL', 'UPLOAD_TO_HUB',
-                   'SIMULATE_DATE', 'MAXMIND_GEOIP_COUNTRY_PATH', 'OUTPUT_VOLUME')
+                   'SIMULATE_DATE', 'MAXMIND_GEOIP_COUNTRY_PATH', 'OUTPUT_VOLUME', 'CLEAN_FOR_RERUN')
+    logging.basicConfig(format='%(message)s', level=logging.INFO)
 
     # thismodule = sys.modules[__name__] # not sure this is needed
 
     def __init__(self):
+        self.log = logging.getLogger(__name__)
         # things that come from the configuration file
         self.robots_reg = None
         self.machines_reg = None
@@ -40,6 +44,7 @@ def __init__(self):
         self.simulate_date = None
         self.maxmind_geoip_country_path = None
         self.output_volume = None
+        self.clean_for_rerun = None
 
         # things that are stored or calculated separately
         self.start_date = None
@@ -72,7 +77,7 @@ def __init__(self):
         secret = os.path.join(os.path.dirname(self.config_file), 'secrets.yaml')
         if os.path.isfile(secret) == True:
             with open(secret, 'r') as ymlfile:
-                cfg = yaml.load(ymlfile)
+                cfg = yaml.safe_load(ymlfile)
                 for x in cfg:
                     setattr(self, x, cfg[x])
 
@@ -88,6 +93,8 @@ def __init__(self):
         if isinstance(self.output_volume, str):
             self.output_volume = (self.output_volume.lower() == 'true')
+        if isinstance(self.clean_for_rerun, str):
+            self.clean_for_rerun = (self.clean_for_rerun.lower() == 'true')
 
         # simulate date, in case someone wants to simulate running on a day besides now
         if self.simulate_date is not None:
@@ -238,6 +245,22 @@ def filenames_to_process(self):
         return [ self.log_name_pattern.replace('(yyyy-mm-dd)', self.year_month + '-' + ("%02d" % x))
                 for x in range(to_process_from, ld + 1) ]
 
+    def delete_log_processed_date(self):
+        # clean up data for this period, so it can be re-run
+        if self.year_month in self.state_dict:
+            if 'id' in self.state_dict[self.year_month]:
+                upload.delete_from_datacite(self.state_dict[self.year_month]['id'])
+            self.log.info(f"Removing state: {self.year_month}")
+            # remove the info from the state json
+            self.state_dict.pop(self.year_month)
+        # delete the specific database for this time period
+        my_file = f'state/counter_db_{self.year_month}.sqlite3'
+        if os.path.exists(my_file):
+            self.log.info(f"Deleting file: {my_file}")
{my_file}") + os.remove(my_file) + with open('state/statefile.json', 'w') as f: + json.dump(self.state_dict, f, sort_keys = True, indent = 4, ensure_ascii=False) + def update_log_processed_date(self): if self.year_month in self.state_dict: self.state_dict[self.year_month]['last_processed_day'] = int(self.last_day().split('-')[2]) diff --git a/main.py b/main.py index d8a3299..13ae9fe 100755 --- a/main.py +++ b/main.py @@ -16,6 +16,10 @@ def main(): else: DbActions.vacuum() # cleans up DB indices for speed + # if re-running a particular month then remove the db and entry in the state file + if config.Config().clean_for_rerun == True: + config.Config().delete_log_processed_date() + the_filenames = config.Config().filenames_to_process() diff --git a/upload/upload.py b/upload/upload.py index 24f877a..238280f 100644 --- a/upload/upload.py +++ b/upload/upload.py @@ -71,7 +71,8 @@ def send_to_datacite(): save_response(response) json_data = json.loads(response.text) if 'report' in json_data: - config.Config().write_id(json_data['report']['id']) + my_id = json_data['report']['id'] + config.Config().write_id(my_id) else: my_url = urljoin(config.Config().hub_base_url, f'reports/{pathname2url(my_id)}') # response = requests.put(my_url, data=data.encode("utf-8"), headers=headers) @@ -83,4 +84,15 @@ def send_to_datacite(): print("Expected to get 200 range status code when sending the report to the hub. Check tmp/datacite_response_body.txt for response.") sys.exit(1) else: - print('submitted') + print(f'Submitted ID: {my_id}') + +def delete_from_datacite(id): + headers = { + 'Authorization': f'Bearer {config.Config().hub_api_token}' + } + my_url = urljoin(config.Config().hub_base_url, f'reports/{pathname2url(id)}') + response = retry_if_500(method='delete', url=my_url, data='', headers=headers) + if response.status_code < 200 or response.status_code > 299: + print(f'Delete ID: {id}. Expected to get 204, but got code {response.status_code}') + else: + print(f'Deleted ID: {id}')