[WIP] scaling out spreadsheet and data files #263

Status: Open · wants to merge 11 commits into base: master
10 changes: 8 additions & 2 deletions .gitlab-ci.yml
@@ -2,13 +2,12 @@ image: python:3.6

stages:
  - dcp_wide_test
  - 1000_cell_test
  - 100_cell_test

before_script:
  - apt-get -y update
  - apt-get -y install jq
  - pip install -r requirements.txt
  - export CI_COMMIT_REF_NAME=integration
  - export DEPLOYMENT_ENV=$CI_COMMIT_REF_NAME
  - export SWAGGER_URL="https://dss.$DEPLOYMENT_ENV.data.humancellatlas.org/v1/swagger.json"
  - mkdir -p ~/.config/hca
@@ -42,3 +41,10 @@ dcp_wide_test_optimus:
    - staging
  script:
    - python -m unittest tests.integration.test_end_to_end_dcp.TestOptimusRun.test_optimus_run

dcp_wide_ss2_scale_test_4000:
  stage: dcp_wide_test
  only:
    - scale-out-spreadsheet-and-files
  script:
    - CI_COMMIT_REF_NAME=integration DEPLOYMENT_STAGE=integration python -m unittest tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles
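
For local debugging outside GitLab CI, the same scale test can be driven from Python. A minimal sketch, assuming the repository root is the working directory and AWS credentials are already configured:

import os
import unittest

# Mirror the environment the CI job sets (values taken from .gitlab-ci.yml above).
os.environ["CI_COMMIT_REF_NAME"] = "integration"
os.environ["DEPLOYMENT_STAGE"] = "integration"

suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles"
)
unittest.TextTestRunner(verbosity=2).run(suite)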
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@ iso8601
requests
urllib3
hca>=4.9.0
-openpyxl
+openpyxl==2.3.5
awscli
hca-ingest
cromwell-tools>=1.1.2
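The exact pin likely protects the range-string call style used in tests/dataset_fixture.py below: openpyxl 2.3.x accepts ws.iter_rows("A4:AG4") positionally, while later releases moved to min_row/max_row/min_col/max_col keywords and eventually dropped the range-string argument. A minimal sketch of the two styles (the keyword form is the modern equivalent, not what this PR uses):

import openpyxl

wb = openpyxl.Workbook()
ws = wb.active
ws.append(["h1", "h2", "h3"])

# openpyxl 2.3.x style, as used in _fetch_row_values below:
#     rows = ws.iter_rows("A1:C1")
# Modern equivalent with explicit bounds:
rows = ws.iter_rows(min_row=1, max_row=1, min_col=1, max_col=3)
print([cell.value for row in rows for cell in row])  # ['h1', 'h2', 'h3']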
98 changes: 96 additions & 2 deletions tests/dataset_fixture.py
@@ -2,8 +2,10 @@
import os
import glob

import boto3
import openpyxl
import requests
from hca.util.pool import ThreadPool


class DatasetFixture:
@@ -35,6 +37,94 @@ def __init__(self, dataset_name, deployment):
        self.config = json.load(json_data)
        self.config["spreadsheet_location"] = self.config["spreadsheet_location"].replace("DEPLOYMENT", self.deployment)
        self._download_spreadsheet()
        if self.config["generate_scaled_spreadsheet"]:
            self._generate_scaled_spreadsheet_and_data_files()

    def _generate_scaled_spreadsheet_and_data_files(self):
        self._scale_spreadsheet_cell_suspensions()
        self._scale_sequence_files()

    def _scale_spreadsheet_cell_suspensions(self):
        cell_suspension_tab = self.spreadsheet['Cell suspension']
        # Row 6 is the first data row; rows 1-5 hold headers.
        row_to_copy = self._fetch_row_with_headers(cell_suspension_tab, 6)
        num_rows_to_copy = self.config["expected_bundle_count"]
        for row_idx in range(2, num_rows_to_copy + 1):
            new_row = row_to_copy.copy()
            new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx))
            new_row["cell_suspension.plate_based_sequencing.plate_label"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_label"] + 1
            new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}"
            # The dict preserves insertion order, so values() still lines up with the headers.
            cell_suspension_tab.append(list(new_row.values()))
        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
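
One caveat worth noting on the id-rewriting scheme above (the same pattern recurs in _scale_sequence_files below): str.replace rewrites every "1" in the template id, which is only safe while the template id contains a single "1". A quick illustration:

# Template id shape is an assumption; the real value comes from the spreadsheet.
template_id = "cell_suspension_1"
print(template_id.replace("1", "3"))        # cell_suspension_3  (intended)
print("cell_11_donor1".replace("1", "3"))   # cell_33_donor3     (every "1" rewritten)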

    def _scale_sequence_files(self):
        sequence_file_tab = self.spreadsheet['Sequence file']
        first_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 6)
        second_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 7)
        num_rows_to_copy = self.config["expected_bundle_count"]
        orig_filename_1 = first_file_row_to_copy['sequence_file.file_core.file_name']
        orig_filename_2 = second_file_row_to_copy['sequence_file.file_core.file_name']
        pool = ThreadPool()
        # Source and target prefixes differ, so copying under the original names
        # seeds the target location with the two template files.
        pool.add_task(self._copy_sequence_file, orig_filename_1, orig_filename_1)
        pool.add_task(self._copy_sequence_file, orig_filename_2, orig_filename_2)
        for row_idx in range(2, num_rows_to_copy + 1):
            new_first_file_row = first_file_row_to_copy.copy()
            new_second_file_row = second_file_row_to_copy.copy()
            new_filename_1 = f"{row_idx}_{orig_filename_1}"
            new_filename_2 = f"{row_idx}_{orig_filename_2}"
            new_first_file_row["sequence_file.file_core.file_name"] = new_filename_1
            new_second_file_row["sequence_file.file_core.file_name"] = new_filename_2
            new_first_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = first_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
            new_second_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = second_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
            new_first_file_row["process.process_core.process_id"] = row_idx
            new_second_file_row["process.process_core.process_id"] = row_idx
            sequence_file_tab.append(list(new_first_file_row.values()))
            sequence_file_tab.append(list(new_second_file_row.values()))
            pool.add_task(self._copy_sequence_file, orig_filename_1, new_filename_1)
            pool.add_task(self._copy_sequence_file, orig_filename_2, new_filename_2)
        pool.wait_for_completion()
        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
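
hca.util.pool.ThreadPool is used here purely for its add_task/wait_for_completion fan-out. For reference, the same pattern with only the standard library looks roughly like this (a sketch with a stand-in copy function, not the code under review):

from concurrent.futures import ThreadPoolExecutor

def copy_file(source_name, target_name):
    print(f"copying {source_name} -> {target_name}")  # stand-in for the S3 copy

jobs = [("r1.fastq.gz", f"{i}_r1.fastq.gz") for i in range(2, 6)]
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(copy_file, src, dst) for src, dst in jobs]
    for future in futures:
        future.result()  # blocks like wait_for_completion, and re-raises any copy error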

    def _copy_sequence_file(self, source_file_name, target_file_name):
        s3_client = boto3.client('s3')
        source_s3_prefix = self.config["orig_copy_files_location"]
        source_s3_path = f"{source_s3_prefix}{source_file_name}"
        # Split "s3://bucket/key" into bucket and key.
        s3_path_split = source_s3_path.replace("s3://", "").split("/", 1)
        source_bucket = s3_path_split[0]
        source_key = s3_path_split[1]

        target_s3_prefix = self.config["data_files_location"]
        target_s3_path = f"{target_s3_prefix}{target_file_name}"
        s3_path_split = target_s3_path.replace("s3://", "").split("/", 1)
        target_bucket = s3_path_split[0]
        target_key = s3_path_split[1]

        copy_source = {
            'Bucket': source_bucket,
            'Key': source_key
        }
        upload_args = {
            'CopySource': copy_source,
            'Bucket': target_bucket,
            'Key': target_key
        }
        print(f"copying {source_s3_path} to {target_s3_path}")
        # boto3's managed copy, which handles multipart transfer for large files.
        s3_client.copy(**upload_args)
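
The string-splitting above assumes well-formed s3:// URLs. A standard-library alternative that makes the bucket/key split explicit (a sketch, not part of this PR):

from urllib.parse import urlparse

def split_s3_url(s3_url):
    parsed = urlparse(s3_url)  # netloc is the bucket; path is "/key"
    return parsed.netloc, parsed.path.lstrip("/")

# Hypothetical object under the real fixture prefix:
print(split_s3_url("s3://org-humancellatlas-dcp-test-data/ss2-100/2_r1.fastq.gz"))
# ('org-humancellatlas-dcp-test-data', 'ss2-100/2_r1.fastq.gz')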

    def _fetch_row_with_headers(self, worksheet, row_idx):
        # Column headers live in row 4; pair them with the values in row_idx.
        row = {}
        headers = self._fetch_row_values(worksheet, "A4:AG4")
        value_idxs = f"A{row_idx}:AG{row_idx}"
        values = self._fetch_row_values(worksheet, value_idxs)
        for idx, header in enumerate(headers):
            row[header] = values[idx]
        return row

    def _fetch_row_values(self, worksheet, range_string):
        # openpyxl 2.3.x (pinned in requirements.txt) accepts a range string here.
        values = []
        for row in worksheet.iter_rows(range_string):
            for cell in row:
                values.append(cell.value)
        return values

    def _download_spreadsheet(self):
        response = requests.get(self.config["spreadsheet_location"])
@@ -63,7 +153,11 @@ def count_of_rows_in_spreadsheet_tab(self, tab_name, header_rows=5):
         ws = self.spreadsheet[tab_name]
         rows_with_content = 0
         row = header_rows + 1
-        while ws.cell(row=row, column=1).value:
-            rows_with_content += 1
+        extra_rows_to_check = 10
+        while extra_rows_to_check > 0:
+            if ws.cell(row=row, column=1).value:
+                rows_with_content += 1
+            else:
+                extra_rows_to_check -= 1
             row += 1
         return rows_with_content
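
The rewritten loop no longer stops at the first empty cell in column A: it keeps scanning until it has accumulated 10 blank rows (a cumulative budget, not consecutive), so sparse blanks in a scaled spreadsheet no longer truncate the count. A standalone transcription for illustration:

import openpyxl

def count_rows(ws, header_rows=5, extra_rows_to_check=10):
    # Transcription of the patched loop above, lifted out of the fixture class.
    rows_with_content = 0
    row = header_rows + 1
    while extra_rows_to_check > 0:
        if ws.cell(row=row, column=1).value:
            rows_with_content += 1
        else:
            extra_rows_to_check -= 1
        row += 1
    return rows_with_content

wb = openpyxl.Workbook()
ws = wb.active
for value in ["h1", "h2", "h3", "h4", "h5", "a", None, "b"]:
    ws.append([value])
print(count_rows(ws))  # 2 -- the old `while value:` loop would have reported 1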
Binary file modified tests/fixtures/.DS_Store
Binary file not shown.
5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_100/README.json

This file was deleted.

5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_1000/README.json

This file was deleted.

5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_200/README.json

This file was deleted.

5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_400/README.json

This file was deleted.

7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_100/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-100/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 100,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_1000/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-1000/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 1000,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_2000/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-2000/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 2000,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_4000/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-4000/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 4000,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
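
Each README.json above is the per-fixture config consumed by DatasetFixture; the DEPLOYMENT token in spreadsheet_location is substituted at runtime (see __init__ in the dataset_fixture.py diff). A minimal sketch of that resolution step, using the ss2_100 fixture:

import json

deployment = "integration"
with open("tests/fixtures/datasets/ss2_100/README.json") as json_data:
    config = json.load(json_data)

# Same substitution DatasetFixture.__init__ performs on the raw config.
config["spreadsheet_location"] = config["spreadsheet_location"].replace("DEPLOYMENT", deployment)
print(config["spreadsheet_location"])
print(config["expected_bundle_count"])  # 100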
16 changes: 8 additions & 8 deletions tests/scale/test_big_bundles.py
@@ -8,19 +8,19 @@
 class TestBigBundles(unittest.TestCase):
 
     def test_one_submission_with_100_bundles(self):
-        self._run(fixture_name='gliob_100')
+        self._run(fixture_name='ss2_100')
 
-    def test_one_submission_with_200_bundles(self):
-        self._run(fixture_name='gliob_200')
+    def test_one_submission_with_1000_bundles(self):
+        self._run(fixture_name='ss2_1000')
 
-    def test_one_submission_with_400_bundles(self):
-        self._run(fixture_name='gliob_400')
+    def test_one_submission_with_2000_bundles(self):
+        self._run(fixture_name='ss2_2000')
 
-    def test_one_submission_with_1000_bundles(self):
-        self._run(fixture_name='gliob_1000')
+    def test_one_submission_with_4000_bundles(self):
+        self._run(fixture_name='ss2_4000')
 
     def test_ingest_and_upload_only_1000_bundle_submissions(self):
-        self._run(fixture_name='gliob_1000', export_bundles=False)
+        self._run(fixture_name='ss2_1000', export_bundles=False)
 
     def _run(self, fixture_name, export_bundles=True):
         print("")