[WIP] scaling out spreadsheet and data files #263

Status: Open · wants to merge 11 commits into base: master
10 changes: 8 additions & 2 deletions .gitlab-ci.yml
@@ -2,13 +2,12 @@ image: python:3.6

stages:
  - dcp_wide_test
  - 1000_cell_test
  - 100_cell_test

before_script:
  - apt-get -y update
  - apt-get -y install jq
  - pip install -r requirements.txt
  - export CI_COMMIT_REF_NAME=integration
  - export DEPLOYMENT_ENV=$CI_COMMIT_REF_NAME
  - export SWAGGER_URL="https://dss.$DEPLOYMENT_ENV.data.humancellatlas.org/v1/swagger.json"
  - mkdir -p ~/.config/hca
@@ -42,3 +41,10 @@ dcp_wide_test_optimus:
    - staging
  script:
    - python -m unittest tests.integration.test_end_to_end_dcp.TestOptimusRun.test_optimus_run

dcp_wide_ss2_scale_test_4000:
  stage: dcp_wide_test
  only:
    - scale-out-spreadsheet-and-files
  script:
    - CI_COMMIT_REF_NAME=integration DEPLOYMENT_STAGE=integration python -m unittest tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles
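
For local debugging outside GitLab CI, the same scale test can be driven from Python. A minimal sketch, assuming the repository root is the working directory and AWS credentials are already configured:

import os
import unittest

# Mirror the environment the CI job sets (values taken from .gitlab-ci.yml above).
os.environ["CI_COMMIT_REF_NAME"] = "integration"
os.environ["DEPLOYMENT_STAGE"] = "integration"

suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles"
)
unittest.TextTestRunner(verbosity=2).run(suite)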
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@ iso8601
requests
urllib3
hca>=4.9.0
-openpyxl
+openpyxl==2.3.5
awscli
hca-ingest
cromwell-tools>=1.1.2
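The exact pin likely protects the range-string call style used in tests/dataset_fixture.py below: openpyxl 2.3.x accepts ws.iter_rows("A4:AG4") positionally, while later releases moved to min_row/max_row/min_col/max_col keywords and eventually dropped the range-string argument. A minimal sketch of the two styles (the keyword form is the modern equivalent, not what this PR uses):

import openpyxl

wb = openpyxl.Workbook()
ws = wb.active
ws.append(["h1", "h2", "h3"])

# openpyxl 2.3.x style, as used in _fetch_row_values below:
#     rows = ws.iter_rows("A1:C1")
# Modern equivalent with explicit bounds:
rows = ws.iter_rows(min_row=1, max_row=1, min_col=1, max_col=3)
print([cell.value for row in rows for cell in row])  # ['h1', 'h2', 'h3']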
98 changes: 96 additions & 2 deletions tests/dataset_fixture.py
@@ -2,8 +2,10 @@
import os
import glob

import boto3
import openpyxl
import requests
from hca.util.pool import ThreadPool


class DatasetFixture:
@@ -35,6 +37,94 @@ def __init__(self, dataset_name, deployment):
        self.config = json.load(json_data)
        self.config["spreadsheet_location"] = self.config["spreadsheet_location"].replace("DEPLOYMENT", self.deployment)
        self._download_spreadsheet()
        if self.config["generate_scaled_spreadsheet"]:
            self._generate_scaled_spreadsheet_and_data_files()

    def _generate_scaled_spreadsheet_and_data_files(self):
        self._scale_spreadsheet_cell_suspensions()
        self._scale_sequence_files()

    def _scale_spreadsheet_cell_suspensions(self):
        cell_suspension_tab = self.spreadsheet['Cell suspension']
        # Row 6 is the first data row; rows 1-5 hold headers.
        row_to_copy = self._fetch_row_with_headers(cell_suspension_tab, 6)
        num_rows_to_copy = self.config["expected_bundle_count"]
        for row_idx in range(2, num_rows_to_copy + 1):
            new_row = row_to_copy.copy()
            new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx))
            new_row["cell_suspension.plate_based_sequencing.plate_label"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_label"] + 1
            new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}"
            # The dict preserves insertion order, so values() still lines up with the headers.
            cell_suspension_tab.append(list(new_row.values()))
        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
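
One caveat worth noting on the id-rewriting scheme above (the same pattern recurs in _scale_sequence_files below): str.replace rewrites every "1" in the template id, which is only safe while the template id contains a single "1". A quick illustration:

# Template id shape is an assumption; the real value comes from the spreadsheet.
template_id = "cell_suspension_1"
print(template_id.replace("1", "3"))        # cell_suspension_3  (intended)
print("cell_11_donor1".replace("1", "3"))   # cell_33_donor3     (every "1" rewritten)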

    def _scale_sequence_files(self):
        sequence_file_tab = self.spreadsheet['Sequence file']
        first_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 6)
        second_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 7)
        num_rows_to_copy = self.config["expected_bundle_count"]
        orig_filename_1 = first_file_row_to_copy['sequence_file.file_core.file_name']
        orig_filename_2 = second_file_row_to_copy['sequence_file.file_core.file_name']
        pool = ThreadPool()
        # Source and target prefixes differ, so copying under the original names
        # seeds the target location with the two template files.
        pool.add_task(self._copy_sequence_file, orig_filename_1, orig_filename_1)
        pool.add_task(self._copy_sequence_file, orig_filename_2, orig_filename_2)
        for row_idx in range(2, num_rows_to_copy + 1):
            new_first_file_row = first_file_row_to_copy.copy()
            new_second_file_row = second_file_row_to_copy.copy()
            new_filename_1 = f"{row_idx}_{orig_filename_1}"
            new_filename_2 = f"{row_idx}_{orig_filename_2}"
            new_first_file_row["sequence_file.file_core.file_name"] = new_filename_1
            new_second_file_row["sequence_file.file_core.file_name"] = new_filename_2
            new_first_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = first_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
            new_second_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = second_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
            new_first_file_row["process.process_core.process_id"] = row_idx
            new_second_file_row["process.process_core.process_id"] = row_idx
            sequence_file_tab.append(list(new_first_file_row.values()))
            sequence_file_tab.append(list(new_second_file_row.values()))
            pool.add_task(self._copy_sequence_file, orig_filename_1, new_filename_1)
            pool.add_task(self._copy_sequence_file, orig_filename_2, new_filename_2)
        pool.wait_for_completion()
        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
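
hca.util.pool.ThreadPool is used here purely for its add_task/wait_for_completion fan-out. For reference, the same pattern with only the standard library looks roughly like this (a sketch with a stand-in copy function, not the code under review):

from concurrent.futures import ThreadPoolExecutor

def copy_file(source_name, target_name):
    print(f"copying {source_name} -> {target_name}")  # stand-in for the S3 copy

jobs = [("r1.fastq.gz", f"{i}_r1.fastq.gz") for i in range(2, 6)]
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(copy_file, src, dst) for src, dst in jobs]
    for future in futures:
        future.result()  # blocks like wait_for_completion, and re-raises any copy error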

    def _copy_sequence_file(self, source_file_name, target_file_name):
        s3_client = boto3.client('s3')
        source_s3_prefix = self.config["orig_copy_files_location"]
        source_s3_path = f"{source_s3_prefix}{source_file_name}"
        # Split "s3://bucket/key" into bucket and key.
        s3_path_split = source_s3_path.replace("s3://", "").split("/", 1)
        source_bucket = s3_path_split[0]
        source_key = s3_path_split[1]

        target_s3_prefix = self.config["data_files_location"]
        target_s3_path = f"{target_s3_prefix}{target_file_name}"
        s3_path_split = target_s3_path.replace("s3://", "").split("/", 1)
        target_bucket = s3_path_split[0]
        target_key = s3_path_split[1]

        copy_source = {
            'Bucket': source_bucket,
            'Key': source_key
        }
        upload_args = {
            'CopySource': copy_source,
            'Bucket': target_bucket,
            'Key': target_key
        }
        print(f"copying {source_s3_path} to {target_s3_path}")
        # boto3's managed copy, which handles multipart transfer for large files.
        s3_client.copy(**upload_args)
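
The string-splitting above assumes well-formed s3:// URLs. A standard-library alternative that makes the bucket/key split explicit (a sketch, not part of this PR):

from urllib.parse import urlparse

def split_s3_url(s3_url):
    parsed = urlparse(s3_url)  # netloc is the bucket; path is "/key"
    return parsed.netloc, parsed.path.lstrip("/")

# Hypothetical object under the real fixture prefix:
print(split_s3_url("s3://org-humancellatlas-dcp-test-data/ss2-100/2_r1.fastq.gz"))
# ('org-humancellatlas-dcp-test-data', 'ss2-100/2_r1.fastq.gz')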

    def _fetch_row_with_headers(self, worksheet, row_idx):
        # Column headers live in row 4; pair them with the values in row_idx.
        row = {}
        headers = self._fetch_row_values(worksheet, "A4:AG4")
        value_idxs = f"A{row_idx}:AG{row_idx}"
        values = self._fetch_row_values(worksheet, value_idxs)
        for idx, header in enumerate(headers):
            row[header] = values[idx]
        return row

    def _fetch_row_values(self, worksheet, range_string):
        # openpyxl 2.3.x (pinned in requirements.txt) accepts a range string here.
        values = []
        for row in worksheet.iter_rows(range_string):
            for cell in row:
                values.append(cell.value)
        return values

    def _download_spreadsheet(self):
        response = requests.get(self.config["spreadsheet_location"])
@@ -63,7 +153,11 @@ def count_of_rows_in_spreadsheet_tab(self, tab_name, header_rows=5):
         ws = self.spreadsheet[tab_name]
         rows_with_content = 0
         row = header_rows + 1
-        while ws.cell(row=row, column=1).value:
-            rows_with_content += 1
+        extra_rows_to_check = 10
+        while extra_rows_to_check > 0:
+            if ws.cell(row=row, column=1).value:
+                rows_with_content += 1
+            else:
+                extra_rows_to_check -= 1
             row += 1
         return rows_with_content
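
The rewritten loop no longer stops at the first empty cell in column A: it keeps scanning until it has accumulated 10 blank rows (a cumulative budget, not consecutive), so sparse blanks in a scaled spreadsheet no longer truncate the count. A standalone transcription for illustration:

import openpyxl

def count_rows(ws, header_rows=5, extra_rows_to_check=10):
    # Transcription of the patched loop above, lifted out of the fixture class.
    rows_with_content = 0
    row = header_rows + 1
    while extra_rows_to_check > 0:
        if ws.cell(row=row, column=1).value:
            rows_with_content += 1
        else:
            extra_rows_to_check -= 1
        row += 1
    return rows_with_content

wb = openpyxl.Workbook()
ws = wb.active
for value in ["h1", "h2", "h3", "h4", "h5", "a", None, "b"]:
    ws.append([value])
print(count_rows(ws))  # 2 -- the old `while value:` loop would have reported 1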
Binary file modified tests/fixtures/.DS_Store
Binary file not shown.
5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_100/README.json

This file was deleted.

5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_1000/README.json

This file was deleted.

5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_200/README.json

This file was deleted.

5 changes: 0 additions & 5 deletions tests/fixtures/datasets/gliob_400/README.json

This file was deleted.

7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_100/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-100/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 100,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_1000/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-1000/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 1000,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_2000/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-2000/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 2000,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
7 changes: 7 additions & 0 deletions tests/fixtures/datasets/ss2_4000/README.json
@@ -0,0 +1,7 @@
{
  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-4000/",
  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
  "expected_bundle_count": 4000,
  "generate_scaled_spreadsheet": true,
  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
}
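
Each README.json above is the per-fixture config consumed by DatasetFixture; the DEPLOYMENT token in spreadsheet_location is substituted at runtime (see __init__ in the dataset_fixture.py diff). A minimal sketch of that resolution step, using the ss2_100 fixture:

import json

deployment = "integration"
with open("tests/fixtures/datasets/ss2_100/README.json") as json_data:
    config = json.load(json_data)

# Same substitution DatasetFixture.__init__ performs on the raw config.
config["spreadsheet_location"] = config["spreadsheet_location"].replace("DEPLOYMENT", deployment)
print(config["spreadsheet_location"])
print(config["expected_bundle_count"])  # 100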
16 changes: 8 additions & 8 deletions tests/scale/test_big_bundles.py
@@ -8,19 +8,19 @@
 class TestBigBundles(unittest.TestCase):
 
     def test_one_submission_with_100_bundles(self):
-        self._run(fixture_name='gliob_100')
+        self._run(fixture_name='ss2_100')
 
-    def test_one_submission_with_200_bundles(self):
-        self._run(fixture_name='gliob_200')
+    def test_one_submission_with_1000_bundles(self):
+        self._run(fixture_name='ss2_1000')
 
-    def test_one_submission_with_400_bundles(self):
-        self._run(fixture_name='gliob_400')
+    def test_one_submission_with_2000_bundles(self):
+        self._run(fixture_name='ss2_2000')
 
-    def test_one_submission_with_1000_bundles(self):
-        self._run(fixture_name='gliob_1000')
+    def test_one_submission_with_4000_bundles(self):
+        self._run(fixture_name='ss2_4000')
 
     def test_ingest_and_upload_only_1000_bundle_submissions(self):
-        self._run(fixture_name='gliob_1000', export_bundles=False)
+        self._run(fixture_name='ss2_1000', export_bundles=False)
 
     def _run(self, fixture_name, export_bundles=True):
         print("")