From 451c065c001aa686a36d98713832c771d2377427 Mon Sep 17 00:00:00 2001
From: Jakub
Date: Fri, 28 Jun 2019 19:18:59 +0200
Subject: [PATCH] Data and file versions (#36)

* added versioning for file and directory

* automatic list of all files with given extensions from dir and subdirs added. Helps with tracking source code with upload_source_files in neptune.create_experiment()

* fixed formatting
---
 neptunecontrib/api/utils.py           | 35 ++++++++++-
 neptunecontrib/versioning/__init__.py | 15 +++++
 neptunecontrib/versioning/data.py     | 90 +++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 neptunecontrib/versioning/__init__.py
 create mode 100644 neptunecontrib/versioning/data.py

diff --git a/neptunecontrib/api/utils.py b/neptunecontrib/api/utils.py
index 2c2175a..e34c875 100644
--- a/neptunecontrib/api/utils.py
+++ b/neptunecontrib/api/utils.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import os
 import warnings
 
 import pandas as pd
@@ -196,6 +196,39 @@ def strip_prefices(columns, prefices):
     return new_columns
 
 
+def get_filepaths(dirpath='.', extensions=None):
+    """Creates a list of all the files with selected extensions.
+
+    Args:
+        dirpath(str): Folder that is searched, together with all of its subfolders.
+        extensions(list(str) or None): Extensions of the files to list. Defaults to ['.py', '.yaml', '.yml'].
+
+    Returns:
+        list: A list of filepaths with given extensions that are in the directory or subdirectories.
+
+    Examples:
+        Initialize Neptune
+
+        >>> import neptune
+        >>> from neptunecontrib.api.utils import get_filepaths
+        >>> neptune.init('USER_NAME/PROJECT_NAME')
+
+        Create experiment and track all .py files from given directory and subdirs:
+
+        >>> with neptune.create_experiment(upload_source_files=get_filepaths(extensions=['.py'])):
+        ...     neptune.send_metric('score', 0.97)
+
+    """
+    # str.endswith accepts a tuple, so one call checks a filename against every extension
+    suffixes = tuple(extensions) if extensions else ('.py', '.yaml', '.yml')
+    files = []
+    for root, _, filenames in os.walk(dirpath):
+        for filename in filenames:
+            if filename.endswith(suffixes):
+                files.append(os.path.join(root, filename))
+    return files
+
+
 def _prep_time_column(progress_df):
     progress_df['timestamp'] = pd.to_datetime(progress_df['timestamp'])
     progress_df.sort_values('timestamp', inplace=True)
diff --git a/neptunecontrib/versioning/__init__.py b/neptunecontrib/versioning/__init__.py
new file mode 100644
index 0000000..62a86a5
--- /dev/null
+++ b/neptunecontrib/versioning/__init__.py
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2019, Neptune Labs Sp. z o.o.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/neptunecontrib/versioning/data.py b/neptunecontrib/versioning/data.py
new file mode 100644
index 0000000..89b7cd8
--- /dev/null
+++ b/neptunecontrib/versioning/data.py
@@ -0,0 +1,90 @@
+#
+# Copyright (c) 2019, Neptune Labs Sp. z o.o.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import hashlib
+
+import neptune
+
+
+def log_data_version(filepath, prefix='', experiment=None):
+    """Logs data version to Neptune
+
+    For a path it calculates the md5 hash and logs it, along with the path itself, as properties of the experiment.
+    The path can point to a file or to a directory.
+
+    Args:
+        filepath(str): path to the file or directory,
+        prefix(str): Prefix that will be added before 'data_version' and 'data_path'
+        experiment(neptune.experiments.Experiment or None): if the data should be logged to a particular
+           neptune experiment it can be passed here. By default it is logged to the current experiment.
+
+    Raises:
+        NotImplementedError: if filepath is neither an existing file nor an existing directory.
+
+    Examples:
+        Initialize Neptune
+
+        >>> import neptune
+        >>> from neptunecontrib.versioning.data import log_data_version
+        >>> neptune.init('USER_NAME/PROJECT_NAME')
+
+        Log data from filepath
+
+        >>> FILEPATH = '/path/to/data/my_data.csv'
+        >>> with neptune.create_experiment():
+        ...     log_data_version(FILEPATH)
+    """
+    _exp = experiment if experiment is not None else neptune
+    data_version = _md5_hash_path(filepath)  # hashed first, so nothing is logged for an invalid path
+    _exp.set_property('{}data_path'.format(prefix), filepath)
+    _exp.set_property('{}data_version'.format(prefix), data_version)
+
+
+def _md5_hash_path(path):
+    """Returns the md5 hex digest of a file or of a whole directory tree."""
+    if os.path.isdir(path):
+        return _md5_hash_dir(path)
+    if os.path.isfile(path):
+        return _md5_hash_file(path)
+    raise NotImplementedError('Cannot hash {}: not an existing file or directory'.format(path))
+
+
+def _md5_hash_file(filepath):
+    """Returns the md5 hex digest of the content of a single file."""
+    return _update_hash_md5(hashlib.md5(), filepath).hexdigest()
+
+
+def _md5_hash_dir(dirpath):
+    """Returns the md5 hex digest of the relative paths and contents of all files under dirpath."""
+    hash_md5 = hashlib.md5()
+    for root, dirs, files in os.walk(dirpath):
+        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
+        for name in sorted(files):
+            filepath = os.path.join(root, name)
+            # Hash the '/'-separated relative path too, so that renames and empty files change the digest
+            relpath = os.path.relpath(filepath, dirpath).replace(os.sep, '/')
+            hash_md5.update(hashlib.sha1(relpath.encode()).digest())
+            if os.path.isfile(filepath):
+                hash_md5 = _update_hash_md5(hash_md5, filepath)
+    return hash_md5.hexdigest()
+
+
+def _update_hash_md5(hash_md5, filepath):
+    """Feeds the content of filepath into hash_md5 in 4096-byte chunks and returns hash_md5."""
+    with open(filepath, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5