From bf79eb9ba356f8f1bec04bb733a2a98073c73bcb Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Tue, 29 Jan 2019 14:36:21 -0800
Subject: [PATCH 1/6] wip commit for scaling out spreadsheet and data files

---
 tests/dataset_fixture.py                      | 98 +++++++++++++++++-
 tests/fixtures/.DS_Store                      | Bin 6148 -> 6148 bytes
 tests/fixtures/datasets/gliob_100/README.json |  5 -
 .../fixtures/datasets/gliob_1000/README.json  |  5 -
 tests/fixtures/datasets/gliob_200/README.json |  5 -
 tests/fixtures/datasets/gliob_400/README.json |  5 -
 tests/fixtures/datasets/ss2_100/README.json   |  7 ++
 tests/fixtures/datasets/ss2_1000/README.json  |  7 ++
 tests/fixtures/datasets/ss2_4000/README.json  |  7 ++
 tests/scale/test_big_bundles.py               | 15 ++-
 10 files changed, 123 insertions(+), 31 deletions(-)
 delete mode 100644 tests/fixtures/datasets/gliob_100/README.json
 delete mode 100644 tests/fixtures/datasets/gliob_1000/README.json
 delete mode 100644 tests/fixtures/datasets/gliob_200/README.json
 delete mode 100644 tests/fixtures/datasets/gliob_400/README.json
 create mode 100644 tests/fixtures/datasets/ss2_100/README.json
 create mode 100644 tests/fixtures/datasets/ss2_1000/README.json
 create mode 100644 tests/fixtures/datasets/ss2_4000/README.json

diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py
index c074ab2..ee7d37b 100644
--- a/tests/dataset_fixture.py
+++ b/tests/dataset_fixture.py
@@ -2,8 +2,10 @@
 import os
 import glob
 
+import boto3
 import openpyxl
 import requests
+from hca.util.pool import ThreadPool
 
 
 class DatasetFixture:
@@ -21,6 +23,7 @@ class DatasetFixture:
     """
 
     def __init__(self, dataset_name, deployment):
+        self.s3_client = boto3.client('s3')
         self.name = dataset_name
         if deployment == "prod":
             # Metadata uses master branch for prod schemas
@@ -35,6 +38,93 @@ def __init__(self, dataset_name, deployment):
             self.config = json.load(json_data)
         self.config["spreadsheet_location"] = self.config["spreadsheet_location"].replace("DEPLOYMENT", self.deployment)
         self._download_spreadsheet()
+        if self.config["generate_scaled_spreadsheet"] == True:
+            self._generate_scaled_spreadsheet_and_data_files()
+
+    def _generate_scaled_spreadsheet_and_data_files(self):
+        self._scale_spreadsheet_cell_suspensions()
+        self._scale_sequence_files()
+
+    def _scale_spreadsheet_cell_suspensions(self):
+        cell_suspension_tab = self.spreadsheet['Cell suspension']
+        row_to_copy = self._fetch_row_with_headers(cell_suspension_tab, 6)
+        num_rows_to_copy = self.config["expected_bundle_count"]
+        for row_idx in range(2, num_rows_to_copy + 1):
+            new_row = row_to_copy.copy()
+            new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx))
+            new_row["cell_suspension.plate_based_sequencing.plate_id"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_id"] + 1
+            new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}"
+            cell_suspension_tab.append(list(new_row.values()))
+        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
+
+    def _scale_sequence_files(self):
+        sequence_file_tab = self.spreadsheet['Sequence file']
+        first_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 6)
+        second_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 7)
+        num_rows_to_copy = self.config["expected_bundle_count"]
+        orig_filename_1 = first_file_row_to_copy['sequence_file.file_core.file_name']
+        orig_filename_2 = second_file_row_to_copy['sequence_file.file_core.file_name']
+        pool = ThreadPool()
+        pool.add_task(self._copy_sequence_file, orig_filename_1, orig_filename_1)
+        pool.add_task(self._copy_sequence_file, orig_filename_2, orig_filename_2)
+        for row_idx in range(2, num_rows_to_copy + 1):
+            new_first_file_row = first_file_row_to_copy.copy()
+            new_second_file_row = second_file_row_to_copy.copy()
+            new_filename_1 = f"{row_idx}_{orig_filename_1}"
+            new_filename_2 = f"{row_idx}_{orig_filename_2}"
+            new_first_file_row["sequence_file.file_core.file_name"] = new_filename_1
+            new_second_file_row["sequence_file.file_core.file_name"] = new_filename_2
+            new_first_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = first_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
+            new_second_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = second_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
+            new_first_file_row["process.process_core.process_id"] = row_idx
+            new_second_file_row["process.process_core.process_id"] = row_idx
+            sequence_file_tab.append(list(new_first_file_row.values()))
+            sequence_file_tab.append(list(new_second_file_row.values()))
+            pool.add_task(self._copy_sequence_file, orig_filename_1, new_filename_1)
+            pool.add_task(self._copy_sequence_file, orig_filename_2, new_filename_2)
+        pool.wait_for_completion()
+        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
+
+    def _copy_sequence_file(self, source_file_name, target_file_name):
+        source_s3_prefix = self.config["orig_copy_files_location"]
+        source_s3_path = f"{source_s3_prefix}{source_file_name}"
+        s3_path_split = source_s3_path.replace("s3://", "").split("/", 1)
+        source_bucket = s3_path_split[0]
+        source_key = s3_path_split[1]
+
+        target_s3_prefix = self.config["data_files_location"]
+        target_s3_path = f"{target_s3_prefix}{target_file_name}"
+        s3_path_split = target_s3_path.replace("s3://", "").split("/", 1)
+        target_bucket = s3_path_split[0]
+        target_key = s3_path_split[1]
+
+        copy_source = {
+            'Bucket': source_bucket,
+            'Key': source_key
+        }
+        upload_args = {
+            'CopySource': copy_source,
+            'Bucket': target_bucket,
+            'Key': target_key
+        }
+        print(f"copying {source_s3_path} to {target_s3_path}")
+        self.s3_client.copy(**upload_args)
+
+    def _fetch_row_with_headers(self, worksheet, row_idx):
+        row = {}
+        headers = self._fetch_row_values(worksheet, "A4:AG4")
+        value_idxs = f"A{row_idx}:AG{row_idx}"
+        values = self._fetch_row_values(worksheet, value_idxs)
+        for idx, val in enumerate(headers):
+            row[val] = values[idx]
+        return row
+
+    def _fetch_row_values(self, ws, n):
+        values = []
+        for row in ws.iter_rows(n):
+            for cell in row:
+                values.append(cell.value)
+        return values
 
     def _download_spreadsheet(self):
         response = requests.get(self.config["spreadsheet_location"])
@@ -63,7 +153,11 @@ def count_of_rows_in_spreadsheet_tab(self, tab_name, header_rows=5):
         ws = self.spreadsheet[tab_name]
         rows_with_content = 0
         row = header_rows + 1
-        while ws.cell(row=row, column=1).value:
-            rows_with_content += 1
+        extra_rows_to_check = 10
+        while extra_rows_to_check > 0:
+            if ws.cell(row=row, column=1).value:
+                rows_with_content += 1
+            else:
+                extra_rows_to_check -= 1
             row += 1
         return rows_with_content
diff --git a/tests/fixtures/.DS_Store b/tests/fixtures/.DS_Store
index bba90c17e6ad70cae0bfad2b1dec88e10e2dd3ab..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 100644
GIT binary patch
delta 68
zcmZoMXfc=|&Zs)EP;8=}A_oHyFfuR*Y<#H3KJi1>W_At%4o20D55F@{<`+>E1WGX^
QfYbm1h~2Q+QRFZ)06)qPH2?qr

delta 157
zcmZoMXfc=|&e%RNQEZ}~q9_vs0|O%ig8&0V5<@(LA%g*fL0Ry`M0IzN1Sdl}Lk>eG
zLq4)(QC?1dUOGe@0~bRuLoP!iLlHv>gDyicLn=cdgHc&9Le0iRQ})RQA}pJ^Ie0i2
d+czeDXP(S2q9_P5UIB;`fEWxmJBl1;1^|H89%ldm

diff --git a/tests/fixtures/datasets/gliob_100/README.json b/tests/fixtures/datasets/gliob_100/README.json
deleted file mode 100644
index 584be5a..0000000
--- a/tests/fixtures/datasets/gliob_100/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_100/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_100_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 100
-}
diff --git a/tests/fixtures/datasets/gliob_1000/README.json b/tests/fixtures/datasets/gliob_1000/README.json
deleted file mode 100644
index 94b621a..0000000
--- a/tests/fixtures/datasets/gliob_1000/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_1000/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_1000_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 1000
-}
diff --git a/tests/fixtures/datasets/gliob_200/README.json b/tests/fixtures/datasets/gliob_200/README.json
deleted file mode 100644
index 46c55b1..0000000
--- a/tests/fixtures/datasets/gliob_200/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_200/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_200_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 200
-}
diff --git a/tests/fixtures/datasets/gliob_400/README.json b/tests/fixtures/datasets/gliob_400/README.json
deleted file mode 100644
index 7757875..0000000
--- a/tests/fixtures/datasets/gliob_400/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_400/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_400_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 400
-}
diff --git a/tests/fixtures/datasets/ss2_100/README.json b/tests/fixtures/datasets/ss2_100/README.json
new file mode 100644
index 0000000..41cb519
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_100/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-100/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 100,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}
diff --git a/tests/fixtures/datasets/ss2_1000/README.json b/tests/fixtures/datasets/ss2_1000/README.json
new file mode 100644
index 0000000..126e50a
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_1000/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-1000/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 1000,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}
diff --git a/tests/fixtures/datasets/ss2_4000/README.json b/tests/fixtures/datasets/ss2_4000/README.json
new file mode 100644
index 0000000..6d7fc75
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_4000/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-4000/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 4000,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}
diff --git a/tests/scale/test_big_bundles.py b/tests/scale/test_big_bundles.py
index c18f88a..f08b3f5 100644
--- a/tests/scale/test_big_bundles.py
+++ b/tests/scale/test_big_bundles.py
@@ -8,19 +8,16 @@ class TestBigBundles(unittest.TestCase):
 
     def test_one_submission_with_100_bundles(self):
-        self._run(fixture_name='gliob_100')
-
-    def test_one_submission_with_200_bundles(self):
-        self._run(fixture_name='gliob_200')
-
-    def test_one_submission_with_400_bundles(self):
-        self._run(fixture_name='gliob_400')
+        self._run(fixture_name='ss2_100')
 
     def test_one_submission_with_1000_bundles(self):
-        self._run(fixture_name='gliob_1000')
+        self._run(fixture_name='ss2_1000')
+
+    def test_one_submission_with_4000_bundles(self):
+        self._run(fixture_name='ss2_4000')
 
     def test_ingest_and_upload_only_1000_bundle_submissions(self):
-        self._run(fixture_name='gliob_1000', export_bundles=False)
+        self._run(fixture_name='ss2_1000', export_bundles=False)
 
     def _run(self, fixture_name, export_bundles=True):
         print("")
 

From 7cdfd86765122ae6773c1cb88bd9557b988b007d Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Fri, 22 Mar 2019 11:00:25 -0700
Subject: [PATCH 2/6] changes

---
 tests/dataset_fixture.py        | 4 ++--
 tests/scale/test_big_bundles.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py
index ee7d37b..34604e6 100644
--- a/tests/dataset_fixture.py
+++ b/tests/dataset_fixture.py
@@ -23,7 +23,6 @@ class DatasetFixture:
     """
 
     def __init__(self, dataset_name, deployment):
-        self.s3_client = boto3.client('s3')
         self.name = dataset_name
         if deployment == "prod":
             # Metadata uses master branch for prod schemas
@@ -86,6 +85,7 @@ def _scale_sequence_files(self):
         self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
 
     def _copy_sequence_file(self, source_file_name, target_file_name):
+        s3_client = boto3.client('s3')
         source_s3_prefix = self.config["orig_copy_files_location"]
         source_s3_path = f"{source_s3_prefix}{source_file_name}"
         s3_path_split = source_s3_path.replace("s3://", "").split("/", 1)
@@ -108,7 +108,7 @@ def _copy_sequence_file(self, source_file_name, target_file_name):
             'Key': target_key
         }
         print(f"copying {source_s3_path} to {target_s3_path}")
-        self.s3_client.copy(**upload_args)
+        s3_client.copy(**upload_args)
 
     def _fetch_row_with_headers(self, worksheet, row_idx):
         row = {}
diff --git a/tests/scale/test_big_bundles.py b/tests/scale/test_big_bundles.py
index f08b3f5..4bdef0a 100644
--- a/tests/scale/test_big_bundles.py
+++ b/tests/scale/test_big_bundles.py
@@ -13,6 +13,9 @@ def test_one_submission_with_100_bundles(self):
     def test_one_submission_with_1000_bundles(self):
         self._run(fixture_name='ss2_1000')
 
+    def test_one_submission_with_2000_bundles(self):
+        self._run(fixture_name='ss2_2000')
+
     def test_one_submission_with_4000_bundles(self):
         self._run(fixture_name='ss2_4000')
 

From a54685f7d2540517a8ed6f293537b3517ff02b1d Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Sat, 4 May 2019 22:18:11 -0700
Subject: [PATCH 3/6] change plate id to plate well and add fixture for 2000 cell test

---
 .gitlab-ci.yml                               | 2 --
 tests/dataset_fixture.py                     | 2 +-
 tests/fixtures/datasets/ss2_2000/README.json | 7 +++++++
 3 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 tests/fixtures/datasets/ss2_2000/README.json

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b193d89..64520b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,8 +2,6 @@ image: python:3.6
 
 stages:
   - dcp_wide_test
-  - 1000_cell_test
-  - 100_cell_test
 
 before_script:
   - apt-get -y update
diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py
index 34604e6..e760683 100644
--- a/tests/dataset_fixture.py
+++ b/tests/dataset_fixture.py
@@ -51,7 +51,7 @@ def _scale_spreadsheet_cell_suspensions(self):
         for row_idx in range(2, num_rows_to_copy + 1):
             new_row = row_to_copy.copy()
             new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx))
-            new_row["cell_suspension.plate_based_sequencing.plate_id"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_id"] + 1
+            new_row["cell_suspension.plate_based_sequencing.plate_label"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_label"] + 1
             new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}"
             cell_suspension_tab.append(list(new_row.values()))
         self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
diff --git a/tests/fixtures/datasets/ss2_2000/README.json b/tests/fixtures/datasets/ss2_2000/README.json
new file mode 100644
index 0000000..cb2f8f1
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_2000/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-2000/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 2000,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}

From 28bcffd2a1fca97deb70af5d089c6e1d98f0065b Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Wed, 28 Aug 2019 13:06:23 -0700
Subject: [PATCH 4/6] run scale test in gitlab

---
 .gitlab-ci.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 25935e6..b5a28bc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -40,3 +40,10 @@ dcp_wide_test_optimus:
     - staging
   script:
     - python -m unittest tests.integration.test_end_to_end_dcp.TestOptimusRun.test_optimus_run
+
+dcp_wide_ss2_scale_test_4000:
+  stage: dcp_wide_test
+  only:
+    - scale-out-spreadsheet-and-files
+  script:
+    CI_COMMIT_REF_NAME=integration DEPLOYMENT_STAGE=integration python -m unittest tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles

From f0a965f0f84c5a4054e78f02dad385bd879b5c09 Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Wed, 28 Aug 2019 13:08:37 -0700
Subject: [PATCH 5/6] hardcode integration branch

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b5a28bc..b715e83 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -7,6 +7,7 @@ before_script:
   - apt-get -y update
   - apt-get -y install jq
   - pip install -r requirements.txt
+  - export CI_COMMIT_REF_NAME=integration
   - export DEPLOYMENT_ENV=$CI_COMMIT_REF_NAME
   - export SWAGGER_URL="https://dss.$DEPLOYMENT_ENV.data.humancellatlas.org/v1/swagger.json"
   - mkdir -p ~/.config/hca

From 23819713d2f3b85f941ff3846f2399965cd0e172 Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Wed, 28 Aug 2019 13:11:41 -0700
Subject: [PATCH 6/6] freeze openpyxl

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bcc3257..92d7e70 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ iso8601
 requests
 urllib3
 hca>=4.9.0
-openpyxl
+openpyxl==2.3.5
 awscli
 hca-ingest
 cromwell-tools>=1.1.2
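To exercise the new 4000-bundle scale path outside GitLab, a minimal local-run sketch is shown below. The module path and the two environment variables come from the patches above (tests.scale.test_big_bundles and the dcp_wide_ss2_scale_test_4000 job); everything else, such as having the requirements.txt dependencies installed (including the pinned openpyxl==2.3.5), HCA/DCP configuration in ~/.config/hca, and AWS credentials that can reach the org-humancellatlas-dcp-test-data bucket referenced by the ss2_* fixtures, is an assumption about the local environment rather than something these patches set up.

    import os
    import unittest

    # Mirror the variables exported by the dcp_wide_ss2_scale_test_4000 job and
    # the .gitlab-ci.yml before_script; AWS credentials and the installed
    # requirements are assumed to already be present locally.
    os.environ["CI_COMMIT_REF_NAME"] = "integration"
    os.environ["DEPLOYMENT_STAGE"] = "integration"

    # Load and run the single scale test added by this series.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        "tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)

Setting the environment variables before loading the test module matters, since the fixture reads the deployment values when the test is constructed; any deployment other than integration would need matching fixtures and credentials.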