Skip to content

Commit

Permalink
Merge pull request #16 from datascience/dev
Browse files Browse the repository at this point in the history
Fixed upload scripts
  • Loading branch information
artourkin authored Oct 25, 2024
2 parents 6de9830 + 362652b commit ee6edc8
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 10 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ docker-compose -f docker-compose.dev.yaml up --build

File uploading using bash:
```
bash ./utils/fileupload.sh http://localhost:8082 ~/rnd/data/govdocs_fits/govdocs1/000/
bash ./utils/fileupload.sh http://localhost:8082 ~/path/to/files collection_name
```

File uploading using python (the pip package requests is necessary):
```
python ./utils/fileupload.py http://localhost:8082/multipleupload ~/rnd/data/govdocs_fits/govdocs1/000/ 100 3
python ./utils/fileupload.py http://localhost:8082/multipleupload ~/path/to/files 100 3 collection_name
```

## Issues
Expand Down
14 changes: 7 additions & 7 deletions utils/fileupload.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def upload_chunk(url, chunk_files, chunk_count):

return end_time - start_time

def upload_files_in_chunks_parallel(url, folder_path, chunk_size=100, num_parallel_requests=10):
def upload_files_in_chunks_parallel(url, folder_path, chunk_size=100, num_parallel_requests=10, collection_name="dataset"):
headers = {
'accept': '*/*',
}
Expand All @@ -37,7 +37,7 @@ def upload_files_in_chunks_parallel(url, folder_path, chunk_size=100, num_parall

if len(chunk_files) == chunk_size:
with ThreadPoolExecutor(max_workers=num_parallel_requests) as executor:
future = executor.submit(upload_chunk, url, chunk_files, (chunk_count+1)*chunk_size)
future = executor.submit(upload_chunk, url + f"?datasetName={collection_name}", chunk_files, (chunk_count+1)*chunk_size)
duration = future.result()

total_duration += duration
Expand All @@ -47,25 +47,25 @@ def upload_files_in_chunks_parallel(url, folder_path, chunk_size=100, num_parall

if chunk_files:
with ThreadPoolExecutor(max_workers=num_parallel_requests) as executor:
future = executor.submit(upload_chunk, url, chunk_files, chunk_count)
future = executor.submit(upload_chunk, url + f"?datasetName={collection_name}", chunk_files, chunk_count)
duration = future.result()
total_duration += duration

return total_duration

if __name__ == "__main__":
if len(sys.argv) != 5:
print("Usage: python script.py <upload_url> <folder_path> <chunk_size> <num_parallel_requests>")
if len(sys.argv) != 6:
print("Usage: python script.py <upload_url> <folder_path> <chunk_size> <num_parallel_requests> <collection_name>")
sys.exit(1)

upload_url = sys.argv[1]
folder_to_upload = sys.argv[2]
chunk_size = int(sys.argv[3])
num_parallel_requests = int(sys.argv[4])

collection_name = sys.argv[5]
start_script_time = time.time()

total_duration = upload_files_in_chunks_parallel(upload_url, folder_to_upload, chunk_size, num_parallel_requests)
total_duration = upload_files_in_chunks_parallel(upload_url, folder_to_upload, chunk_size, num_parallel_requests, collection_name)

end_script_time = time.time()
script_duration = end_script_time - start_script_time
Expand Down
2 changes: 1 addition & 1 deletion utils/fileupload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ do
(( COUNTER++ ))
(( t=t%PARALLEL_TASKS )); ((t++==0)) && wait
curl --silent --output /dev/null --show-error -X "POST" \
"${1}/upload" \
"${1}/upload?datasetName=${3}" \
-H "accept: */*" \
-H "Content-Type: multipart/form-data" \
-F "file=@${i}" &
Expand Down

0 comments on commit ee6edc8

Please sign in to comment.