Skip to content

Commit

Permalink
Improve upload files performance (#190)
Browse files Browse the repository at this point in the history
Improves listing of files to upload:
- replaces directories with files located in these directories recursively to avoid sending directories (they were sent as packages compressed in memory)
- removes duplicated files to minimize bandwidth usage
  • Loading branch information
szymon-kuklewicz authored Nov 20, 2019
1 parent 95e6569 commit 9da34a7
Showing 1 changed file with 28 additions and 2 deletions.
30 changes: 28 additions & 2 deletions neptune/internal/storage/storage_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def __ne__(self, other):
"""
return not self == other

def __hash__(self):
    """
    Returns a hash computed from the (source_path, target_path) pair
    """
    # Hash the two paths together as one tuple.
    identity = (self.source_path, self.target_path)
    return hash(identity)

def to_str(self):
"""
Returns the string representation of the model
Expand Down Expand Up @@ -93,10 +99,30 @@ def __repr__(self):
return self.to_str()


def scan_unique_upload_entries(upload_entries):
    """
    Expands the given upload entries into a set of unique file-level entries.

    An entry whose source path is a directory is replaced by one UploadEntry per
    file found while walking that directory recursively; each file's target path
    mirrors its location relative to the directory, rooted at the entry's target path.
    Any other entry is kept unchanged.

    The result is a set, so duplicated entries collapse into one (this relies on
    the entries being hashable and comparable) and the input order is not kept.
    """
    unique_entries = set()
    for entry in upload_entries:
        # Anything that is not a directory is passed through as-is.
        if not os.path.isdir(entry.source_path):
            unique_entries.add(entry)
            continue

        for dirpath, _, filenames in os.walk(entry.source_path):
            relative_dir = os.path.relpath(dirpath, entry.source_path)
            # normpath drops the "." that relpath yields for the top-level directory.
            target_dir = os.path.normpath(os.path.join(entry.target_path, relative_dir))
            unique_entries.update(
                UploadEntry(os.path.join(dirpath, name), os.path.join(target_dir, name))
                for name in filenames
            )
    return unique_entries


def split_upload_files(upload_entries, max_package_size=1 * 1024 * 1024, max_files=500):
current_package = UploadPackage()

for entry in upload_entries:
size = 0 if os.path.isdir(entry.source_path) else os.path.getsize(entry.source_path)
size = os.path.getsize(entry.source_path)

if (size + current_package.size > max_package_size or current_package.len > max_files) \
and not current_package.is_empty():
Expand All @@ -112,7 +138,7 @@ def normalize_file_name(name):


def upload_to_storage(upload_entries, upload_api_fun, upload_tar_api_fun, **kwargs):
for package in split_upload_files(upload_entries):
for package in split_upload_files(scan_unique_upload_entries(upload_entries)):
if package.is_empty():
break

Expand Down

0 comments on commit 9da34a7

Please sign in to comment.