From 3f379eef86e3fd8aa9e77fab44f5df5fae519a59 Mon Sep 17 00:00:00 2001 From: Pallab Pain Date: Tue, 12 Nov 2024 16:06:17 +0530 Subject: [PATCH] fix(paramserver): file is too large error (#98) This commit fixes a specific case where a JSON/YAML file is uploaded to the paramserver and is stored by the backend in etcd. The issue has persisted until now because the SDK performs a stat to check the size of a file and determine whether to upload the file as a blob. However, in cases where the file size is close to the upper limit, i.e. 128 KiB, the API payload may inflate due to additional metadata and string conversion. In such cases, while the file size may be under 128 KiB, the final data to be stored in etcd is greater than 128 KiB, which results in a `file is too large` error. Fixes AB#33921 --- rapyuta_io/clients/paramserver.py | 56 ++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/rapyuta_io/clients/paramserver.py b/rapyuta_io/clients/paramserver.py index 7b7bf6be..3897acf4 100644 --- a/rapyuta_io/clients/paramserver.py +++ b/rapyuta_io/clients/paramserver.py @@ -3,6 +3,7 @@ import enum import errno import hashlib +import json import mimetypes import os import tempfile @@ -151,10 +152,16 @@ def process_dir(self, executor, rootdir, tree_path, level, dir_futures, file_fut future = executor.submit(self.create_binary_file, new_tree_path, full_path) if file_name.endswith('.yaml'): data = parse_yaml(full_path) - future = executor.submit(self.create_file, new_tree_path, data) + if self.should_upload_as_binary(data, self.yaml_content_type): + future = executor.submit(self.create_binary_file, new_tree_path, full_path) + else: + future = executor.submit(self.create_file, new_tree_path, data) elif file_name.endswith('.json'): data = parse_json(full_path) - future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type) + if self.should_upload_as_binary(data, self.json_content_type): + future = 
executor.submit(self.create_binary_file, new_tree_path, full_path) + else: + future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type) else: future = executor.submit(self.create_binary_file, new_tree_path, full_path) file_futures[future] = new_tree_path @@ -167,22 +174,45 @@ def process_folder(self, executor, rootdir, tree_path, level, dir_futures, file_ if isdir(full_path): future = executor.submit(self.create_folder, new_tree_path) dir_futures[future] = (new_tree_path, level + 1) - else: - file_stat = os.stat(full_path) - file_name = os.path.basename(full_path) - if file_stat.st_size > self.max_non_binary_size: + continue + file_stat = os.stat(full_path) + file_name = os.path.basename(full_path) + if file_stat.st_size > self.max_non_binary_size: + future = executor.submit(self.create_binary_file, new_tree_path, full_path) + elif file_name.endswith('.yaml'): + data = parse_yaml(full_path) + if self.should_upload_as_binary(data, self.yaml_content_type): future = executor.submit(self.create_binary_file, new_tree_path, full_path) - elif file_name.endswith('.yaml'): - data = parse_yaml(full_path) - future = executor.submit(self.create_file, new_tree_path, data) - elif file_name.endswith('.json'): - data = parse_json(full_path) - future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type) else: + future = executor.submit(self.create_file, new_tree_path, data) + elif file_name.endswith('.json'): + data = parse_json(full_path) + if self.should_upload_as_binary(data, self.json_content_type): future = executor.submit(self.create_binary_file, new_tree_path, full_path) - file_futures[future] = new_tree_path + else: + future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type) + else: + future = executor.submit(self.create_binary_file, new_tree_path, full_path) + file_futures[future] = new_tree_path + return dir_futures, file_futures + def 
should_upload_as_binary(self, filedata, content_type): + """Determines if the file should be uploaded as binary based on the file size + + While the file data may be less than the supported limit, the combined size of + the API payload is what is stored by paramserver. This method calculates the + size of the payload and determines if it exceeds the supported limit. If it does, + the file is uploaded as a binary to the blob store. + + We cannot entirely rely on the file stat result since the file data is sent as + a string in the API payload and the final size inflates when json.dumps is + performed on the final payload. + """ + metadata_size_buffer = 200 # In bytes + payload = {'type': _Node.File, 'data': filedata, 'contentType': content_type} + return len(json.dumps(payload)) + metadata_size_buffer > self.max_non_binary_size + def upload_configurations(self, rootdir, tree_names, delete_existing_trees, as_folder=False): self.validate_args(rootdir, tree_names, delete_existing_trees, as_folder) with futures.ThreadPoolExecutor(max_workers=15) as executor: