From 99e6826ef63bda4449a01bbc7a9e94fb4ff309cd Mon Sep 17 00:00:00 2001
From: Pallab Pain
Date: Thu, 7 Nov 2024 17:04:16 +0530
Subject: [PATCH] fix(paramserver): file is too large error

This commit fixes a `file is too large` error that occurs when a
json/yaml file is uploaded to the paramserver and stored by the
backend in etcd.

The SDK performs a stat on each file to check its size and decide
whether to upload it as a blob. However, when the file size is close
to the upper limit of 128 KiB, the API payload may inflate beyond that
limit due to additional metadata and string conversion. In such cases,
even though the file itself is under 128 KiB, the final data to be
stored in etcd exceeds 128 KiB and the backend throws a `file is too
large` error.

Fixes AB#33921
---
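Note (illustration only, not part of the commit): the inflation is
easy to reproduce with plain json.dumps. The sizes below are
hypothetical, chosen to sit just under the 128 KiB boundary, and
'FileNode' and 'text/yaml' are stand-ins for the SDK's internal
_Node.File constant and content type.

    import json

    MAX_NON_BINARY_SIZE = 128 * 1024  # paramserver's non-binary limit

    # A file a few bytes under the limit passes the stat-based size check.
    filedata = 'x' * (MAX_NON_BINARY_SIZE - 10)
    assert len(filedata) <= MAX_NON_BINARY_SIZE

    # But the serialized API payload wraps the data in metadata and
    # quoting, pushing the value stored in etcd past the limit.
    payload = {'type': 'FileNode', 'data': filedata, 'contentType': 'text/yaml'}
    assert len(json.dumps(payload)) > MAX_NON_BINARY_SIZE
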
 rapyuta_io/clients/paramserver.py | 56 ++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/rapyuta_io/clients/paramserver.py b/rapyuta_io/clients/paramserver.py
index 7b7bf6be..ef6e56f9 100644
--- a/rapyuta_io/clients/paramserver.py
+++ b/rapyuta_io/clients/paramserver.py
@@ -3,6 +3,7 @@
 import enum
 import errno
 import hashlib
+import json
 import mimetypes
 import os
 import tempfile
@@ -151,10 +152,16 @@ def process_dir(self, executor, rootdir, tree_path, level, dir_futures, file_fut
                 future = executor.submit(self.create_binary_file, new_tree_path, full_path)
             if file_name.endswith('.yaml'):
                 data = parse_yaml(full_path)
-                future = executor.submit(self.create_file, new_tree_path, data)
+                if self.should_upload_as_binary(data, self.yaml_content_type):
+                    future = executor.submit(self.create_binary_file, new_tree_path, full_path)
+                else:
+                    future = executor.submit(self.create_file, new_tree_path, data)
             elif file_name.endswith('.json'):
                 data = parse_json(full_path)
-                future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type)
+                if self.should_upload_as_binary(data, self.json_content_type):
+                    future = executor.submit(self.create_binary_file, new_tree_path, full_path)
+                else:
+                    future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type)
             else:
                 future = executor.submit(self.create_binary_file, new_tree_path, full_path)
             file_futures[future] = new_tree_path
@@ -167,22 +174,45 @@ def process_folder(self, executor, rootdir, tree_path, level, dir_futures, file_
             if isdir(full_path):
                 future = executor.submit(self.create_folder, new_tree_path)
                 dir_futures[future] = (new_tree_path, level + 1)
-            else:
-                file_stat = os.stat(full_path)
-                file_name = os.path.basename(full_path)
-                if file_stat.st_size > self.max_non_binary_size:
+                continue
+            file_stat = os.stat(full_path)
+            file_name = os.path.basename(full_path)
+            if file_stat.st_size > self.max_non_binary_size:
+                future = executor.submit(self.create_binary_file, new_tree_path, full_path)
+            elif file_name.endswith('.yaml'):
+                data = parse_yaml(full_path)
+                if self.should_upload_as_binary(data, self.yaml_content_type):
                     future = executor.submit(self.create_binary_file, new_tree_path, full_path)
-                elif file_name.endswith('.yaml'):
-                    data = parse_yaml(full_path)
-                    future = executor.submit(self.create_file, new_tree_path, data)
-                elif file_name.endswith('.json'):
-                    data = parse_json(full_path)
-                    future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type)
                 else:
+                    future = executor.submit(self.create_file, new_tree_path, data)
+            elif file_name.endswith('.json'):
+                data = parse_json(full_path)
+                if self.should_upload_as_binary(data, self.json_content_type):
                     future = executor.submit(self.create_binary_file, new_tree_path, full_path)
-                file_futures[future] = new_tree_path
+                else:
+                    future = executor.submit(self.create_file, new_tree_path, data, content_type=self.json_content_type)
+            else:
+                future = executor.submit(self.create_binary_file, new_tree_path, full_path)
+            file_futures[future] = new_tree_path
+        return dir_futures, file_futures
 
+    def should_upload_as_binary(self, filedata, content_type):
+        """Determines if the file should be uploaded as binary based on the file size
+
+        While the file data may be less than the supported limit, the combined size of
+        the API payload is what is stored by paramserver. This method calculates the
+        size of the payload and determines if it exceeds the supported limit. If it does,
+        the file is uploaded as a binary to the blob store.
+
+        We cannot entirely rely on the file stat result since the file data is sent as
+        a string in the API payload and the final size inflates when json.dumps is
+        performed on the final payload.
+        """
+        metadata_size_buffer = 100  # In bytes
+        payload = {'type': _Node.File, 'data': filedata, 'contentType': content_type}
+        return len(json.dumps(payload)) + metadata_size_buffer > self.max_non_binary_size
+
     def upload_configurations(self, rootdir, tree_names, delete_existing_trees, as_folder=False):
         self.validate_args(rootdir, tree_names, delete_existing_trees, as_folder)
         with futures.ThreadPoolExecutor(max_workers=15) as executor: