From bf79eb9ba356f8f1bec04bb733a2a98073c73bcb Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Tue, 29 Jan 2019 14:36:21 -0800
Subject: [PATCH 1/6] wip commit for scaling out spreadsheet and data files

---
 tests/dataset_fixture.py                      | 98 +++++++++++++++++-
 tests/fixtures/.DS_Store                      | Bin 6148 -> 6148 bytes
 tests/fixtures/datasets/gliob_100/README.json |  5 -
 .../fixtures/datasets/gliob_1000/README.json  |  5 -
 tests/fixtures/datasets/gliob_200/README.json |  5 -
 tests/fixtures/datasets/gliob_400/README.json |  5 -
 tests/fixtures/datasets/ss2_100/README.json   |  7 ++
 tests/fixtures/datasets/ss2_1000/README.json  |  7 ++
 tests/fixtures/datasets/ss2_4000/README.json  |  7 ++
 tests/scale/test_big_bundles.py               | 15 ++-
 10 files changed, 123 insertions(+), 31 deletions(-)
 delete mode 100644 tests/fixtures/datasets/gliob_100/README.json
 delete mode 100644 tests/fixtures/datasets/gliob_1000/README.json
 delete mode 100644 tests/fixtures/datasets/gliob_200/README.json
 delete mode 100644 tests/fixtures/datasets/gliob_400/README.json
 create mode 100644 tests/fixtures/datasets/ss2_100/README.json
 create mode 100644 tests/fixtures/datasets/ss2_1000/README.json
 create mode 100644 tests/fixtures/datasets/ss2_4000/README.json

diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py
index c074ab2..ee7d37b 100644
--- a/tests/dataset_fixture.py
+++ b/tests/dataset_fixture.py
@@ -2,8 +2,10 @@
 import os
 import glob
 
+import boto3
 import openpyxl
 import requests
+from hca.util.pool import ThreadPool
 
 
 class DatasetFixture:
@@ -21,6 +23,7 @@ class DatasetFixture:
     """
 
     def __init__(self, dataset_name, deployment):
+        self.s3_client = boto3.client('s3')
         self.name = dataset_name
         if deployment == "prod":
             # Metadata uses master branch for prod schemas
@@ -35,6 +38,93 @@ def __init__(self, dataset_name, deployment):
             self.config = json.load(json_data)
         self.config["spreadsheet_location"] = self.config["spreadsheet_location"].replace("DEPLOYMENT", self.deployment)
         self._download_spreadsheet()
+        if self.config["generate_scaled_spreadsheet"] == True:
+            self._generate_scaled_spreadsheet_and_data_files()
+
+    def _generate_scaled_spreadsheet_and_data_files(self):
+        self._scale_spreadsheet_cell_suspensions()
+        self._scale_sequence_files()
+
+    def _scale_spreadsheet_cell_suspensions(self):
+        cell_suspension_tab = self.spreadsheet['Cell suspension']
+        row_to_copy = self._fetch_row_with_headers(cell_suspension_tab, 6)
+        num_rows_to_copy = self.config["expected_bundle_count"]
+        for row_idx in range(2, num_rows_to_copy + 1):
+            new_row = row_to_copy.copy()
+            new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx))
+            new_row["cell_suspension.plate_based_sequencing.plate_id"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_id"] + 1
+            new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}"
+            cell_suspension_tab.append(list(new_row.values()))
+        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
+
+    def _scale_sequence_files(self):
+        sequence_file_tab = self.spreadsheet['Sequence file']
+        first_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 6)
+        second_file_row_to_copy = self._fetch_row_with_headers(sequence_file_tab, 7)
+        num_rows_to_copy = self.config["expected_bundle_count"]
+        orig_filename_1 = first_file_row_to_copy['sequence_file.file_core.file_name']
+        orig_filename_2 = second_file_row_to_copy['sequence_file.file_core.file_name']
+        pool = ThreadPool()
+        pool.add_task(self._copy_sequence_file, orig_filename_1, orig_filename_1)
+        pool.add_task(self._copy_sequence_file, orig_filename_2, orig_filename_2)
+        for row_idx in range(2, num_rows_to_copy + 1):
+            new_first_file_row = first_file_row_to_copy.copy()
+            new_second_file_row = second_file_row_to_copy.copy()
+            new_filename_1 = f"{row_idx}_{orig_filename_1}"
+            new_filename_2 = f"{row_idx}_{orig_filename_2}"
+            new_first_file_row["sequence_file.file_core.file_name"] = new_filename_1
+            new_second_file_row["sequence_file.file_core.file_name"] = new_filename_2
+            new_first_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = first_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
+            new_second_file_row["cell_suspension.biomaterial_core.biomaterial_id"] = second_file_row_to_copy['cell_suspension.biomaterial_core.biomaterial_id'].replace("1", str(row_idx))
+            new_first_file_row["process.process_core.process_id"] = row_idx
+            new_second_file_row["process.process_core.process_id"] = row_idx
+            sequence_file_tab.append(list(new_first_file_row.values()))
+            sequence_file_tab.append(list(new_second_file_row.values()))
+            pool.add_task(self._copy_sequence_file, orig_filename_1, new_filename_1)
+            pool.add_task(self._copy_sequence_file, orig_filename_2, new_filename_2)
+        pool.wait_for_completion()
+        self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
+
+    def _copy_sequence_file(self, source_file_name, target_file_name):
+        source_s3_prefix = self.config["orig_copy_files_location"]
+        source_s3_path = f"{source_s3_prefix}{source_file_name}"
+        s3_path_split = source_s3_path.replace("s3://", "").split("/", 1)
+        source_bucket = s3_path_split[0]
+        source_key = s3_path_split[1]
+
+        target_s3_prefix = self.config["data_files_location"]
+        target_s3_path = f"{target_s3_prefix}{target_file_name}"
+        s3_path_split = target_s3_path.replace("s3://", "").split("/", 1)
+        target_bucket = s3_path_split[0]
+        target_key = s3_path_split[1]
+
+        copy_source = {
+            'Bucket': source_bucket,
+            'Key': source_key
+        }
+        upload_args = {
+            'CopySource': copy_source,
+            'Bucket': target_bucket,
+            'Key': target_key
+        }
+        print(f"copying {source_s3_path} to {target_s3_path}")
+        self.s3_client.copy(**upload_args)
+
+    def _fetch_row_with_headers(self, worksheet, row_idx):
+        row = {}
+        headers = self._fetch_row_values(worksheet, "A4:AG4")
+        value_idxs = f"A{row_idx}:AG{row_idx}"
+        values = self._fetch_row_values(worksheet, value_idxs)
+        for idx, val in enumerate(headers):
+            row[val] = values[idx]
+        return row
+
+    def _fetch_row_values(self, ws, n):
+        values = []
+        for row in ws.iter_rows(n):
+            for cell in row:
+                values.append(cell.value)
+        return values
 
     def _download_spreadsheet(self):
         response = requests.get(self.config["spreadsheet_location"])
@@ -63,7 +153,11 @@ def count_of_rows_in_spreadsheet_tab(self, tab_name, header_rows=5):
         ws = self.spreadsheet[tab_name]
         rows_with_content = 0
         row = header_rows + 1
-        while ws.cell(row=row, column=1).value:
-            rows_with_content += 1
+        extra_rows_to_check = 10
+        while extra_rows_to_check > 0:
+            if ws.cell(row=row, column=1).value:
+                rows_with_content += 1
+            else:
+                extra_rows_to_check -= 1
             row += 1
         return rows_with_content
diff --git a/tests/fixtures/.DS_Store b/tests/fixtures/.DS_Store
index bba90c17e6ad70cae0bfad2b1dec88e10e2dd3ab..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 100644
GIT binary patch
delta 68
zcmZoMXfc=|&Zs)EP;8=}A_oHyFfuR*Y<#H3KJi1>W_At%4o20D55F@{<`+>E1WGX^
QfYbm1h~2Q+QRFZ)06)qPH2?qr

delta 157
zcmZoMXfc=|&e%RNQEZ}~q9_vs0|O%ig8&0V5<@(LA%g*fL0Ry`M0IzN1Sdl}Lk>eG
zLq4)(QC?1dUOGe@0~bRuLoP!iLlHv>gDyicLn=cdgHc&9Le0iRQ})RQA}pJ^Ie0i2
d+czeDXP(S2q9_P5UIB;`fEWxmJBl1;1^|H89%ldm

diff --git a/tests/fixtures/datasets/gliob_100/README.json b/tests/fixtures/datasets/gliob_100/README.json
deleted file mode 100644
index 584be5a..0000000
--- a/tests/fixtures/datasets/gliob_100/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_100/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_100_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 100
-}
diff --git a/tests/fixtures/datasets/gliob_1000/README.json b/tests/fixtures/datasets/gliob_1000/README.json
deleted file mode 100644
index 94b621a..0000000
--- a/tests/fixtures/datasets/gliob_1000/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_1000/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_1000_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 1000
-}
diff --git a/tests/fixtures/datasets/gliob_200/README.json b/tests/fixtures/datasets/gliob_200/README.json
deleted file mode 100644
index 46c55b1..0000000
--- a/tests/fixtures/datasets/gliob_200/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_200/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_200_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 200
-}
diff --git a/tests/fixtures/datasets/gliob_400/README.json b/tests/fixtures/datasets/gliob_400/README.json
deleted file mode 100644
index 7757875..0000000
--- a/tests/fixtures/datasets/gliob_400/README.json
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "data_files_location": "s3://org-humancellatlas-dcp-test-data/gliob_400/",
-  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_scale_test_metadata_400_glioblastoma_cells.xlsx",
-  "expected_bundle_count": 400
-}
diff --git a/tests/fixtures/datasets/ss2_100/README.json b/tests/fixtures/datasets/ss2_100/README.json
new file mode 100644
index 0000000..41cb519
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_100/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-100/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 100,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}
diff --git a/tests/fixtures/datasets/ss2_1000/README.json b/tests/fixtures/datasets/ss2_1000/README.json
new file mode 100644
index 0000000..126e50a
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_1000/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-1000/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 1000,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}
diff --git a/tests/fixtures/datasets/ss2_4000/README.json b/tests/fixtures/datasets/ss2_4000/README.json
new file mode 100644
index 0000000..6d7fc75
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_4000/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-4000/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 4000,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}
diff --git a/tests/scale/test_big_bundles.py b/tests/scale/test_big_bundles.py
index c18f88a..f08b3f5 100644
--- a/tests/scale/test_big_bundles.py
+++ b/tests/scale/test_big_bundles.py
@@ -8,19 +8,16 @@ class TestBigBundles(unittest.TestCase):
 
     def test_one_submission_with_100_bundles(self):
-        self._run(fixture_name='gliob_100')
-
-    def test_one_submission_with_200_bundles(self):
-        self._run(fixture_name='gliob_200')
-
-    def test_one_submission_with_400_bundles(self):
-        self._run(fixture_name='gliob_400')
+        self._run(fixture_name='ss2_100')
 
     def test_one_submission_with_1000_bundles(self):
-        self._run(fixture_name='gliob_1000')
+        self._run(fixture_name='ss2_1000')
+
+    def test_one_submission_with_4000_bundles(self):
+        self._run(fixture_name='ss2_4000')
 
     def test_ingest_and_upload_only_1000_bundle_submissions(self):
-        self._run(fixture_name='gliob_1000', export_bundles=False)
+        self._run(fixture_name='ss2_1000', export_bundles=False)
 
     def _run(self, fixture_name, export_bundles=True):
         print("")
 

From 7cdfd86765122ae6773c1cb88bd9557b988b007d Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Fri, 22 Mar 2019 11:00:25 -0700
Subject: [PATCH 2/6] changes

---
 tests/dataset_fixture.py        | 4 ++--
 tests/scale/test_big_bundles.py | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py
index ee7d37b..34604e6 100644
--- a/tests/dataset_fixture.py
+++ b/tests/dataset_fixture.py
@@ -23,7 +23,6 @@ class DatasetFixture:
     """
 
     def __init__(self, dataset_name, deployment):
-        self.s3_client = boto3.client('s3')
         self.name = dataset_name
         if deployment == "prod":
             # Metadata uses master branch for prod schemas
@@ -86,6 +85,7 @@ def _scale_sequence_files(self):
         self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
 
     def _copy_sequence_file(self, source_file_name, target_file_name):
+        s3_client = boto3.client('s3')
         source_s3_prefix = self.config["orig_copy_files_location"]
         source_s3_path = f"{source_s3_prefix}{source_file_name}"
         s3_path_split = source_s3_path.replace("s3://", "").split("/", 1)
@@ -108,7 +108,7 @@ def _copy_sequence_file(self, source_file_name, target_file_name):
             'Key': target_key
         }
         print(f"copying {source_s3_path} to {target_s3_path}")
-        self.s3_client.copy(**upload_args)
+        s3_client.copy(**upload_args)
 
     def _fetch_row_with_headers(self, worksheet, row_idx):
         row = {}
diff --git a/tests/scale/test_big_bundles.py b/tests/scale/test_big_bundles.py
index f08b3f5..4bdef0a 100644
--- a/tests/scale/test_big_bundles.py
+++ b/tests/scale/test_big_bundles.py
@@ -13,6 +13,9 @@ def test_one_submission_with_100_bundles(self):
     def test_one_submission_with_1000_bundles(self):
         self._run(fixture_name='ss2_1000')
 
+    def test_one_submission_with_2000_bundles(self):
+        self._run(fixture_name='ss2_2000')
+
     def test_one_submission_with_4000_bundles(self):
         self._run(fixture_name='ss2_4000')
 

From a54685f7d2540517a8ed6f293537b3517ff02b1d Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Sat, 4 May 2019 22:18:11 -0700
Subject: [PATCH 3/6] change plate id to plate well and add fixture for 2000 cell test

---
 .gitlab-ci.yml                               | 2 --
 tests/dataset_fixture.py                     | 2 +-
 tests/fixtures/datasets/ss2_2000/README.json | 7 +++++++
 3 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 tests/fixtures/datasets/ss2_2000/README.json

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b193d89..64520b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,8 +2,6 @@ image: python:3.6
 
 stages:
   - dcp_wide_test
-  - 1000_cell_test
-  - 100_cell_test
 
 before_script:
   - apt-get -y update
diff --git a/tests/dataset_fixture.py b/tests/dataset_fixture.py
index 34604e6..e760683 100644
--- a/tests/dataset_fixture.py
+++ b/tests/dataset_fixture.py
@@ -51,7 +51,7 @@ def _scale_spreadsheet_cell_suspensions(self):
         for row_idx in range(2, num_rows_to_copy + 1):
             new_row = row_to_copy.copy()
             new_row["cell_suspension.biomaterial_core.biomaterial_id"] = row_to_copy["cell_suspension.biomaterial_core.biomaterial_id"].replace("1", str(row_idx))
-            new_row["cell_suspension.plate_based_sequencing.plate_id"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_id"] + 1
+            new_row["cell_suspension.plate_based_sequencing.plate_label"] = row_to_copy["cell_suspension.plate_based_sequencing.plate_label"] + 1
             new_row["cell_suspension.plate_based_sequencing.well_id"] = f"A{row_idx}"
             cell_suspension_tab.append(list(new_row.values()))
         self.spreadsheet.save(filename=self.metadata_spreadsheet_path)
diff --git a/tests/fixtures/datasets/ss2_2000/README.json b/tests/fixtures/datasets/ss2_2000/README.json
new file mode 100644
index 0000000..cb2f8f1
--- /dev/null
+++ b/tests/fixtures/datasets/ss2_2000/README.json
@@ -0,0 +1,7 @@
+{
+  "data_files_location": "s3://org-humancellatlas-dcp-test-data/ss2-2000/",
+  "spreadsheet_location": "https://raw.github.com/HumanCellAtlas/metadata-schema/DEPLOYMENT/infrastructure_testing_files/current/dcp_integration_test_metadata_1_SS2_bundle.xlsx",
+  "expected_bundle_count": 2000,
+  "generate_scaled_spreadsheet": true,
+  "orig_copy_files_location": "s3://org-humancellatlas-dcp-test-data/smart-seq2-one-bundle/"
+}

From 28bcffd2a1fca97deb70af5d089c6e1d98f0065b Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Wed, 28 Aug 2019 13:06:23 -0700
Subject: [PATCH 4/6] run scale test in gitlab

---
 .gitlab-ci.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 25935e6..b5a28bc 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -40,3 +40,10 @@ dcp_wide_test_optimus:
     - staging
   script:
     - python -m unittest tests.integration.test_end_to_end_dcp.TestOptimusRun.test_optimus_run
+
+dcp_wide_ss2_scale_test_4000:
+  stage: dcp_wide_test
+  only:
+    - scale-out-spreadsheet-and-files
+  script:
+    CI_COMMIT_REF_NAME=integration DEPLOYMENT_STAGE=integration python -m unittest tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles

From f0a965f0f84c5a4054e78f02dad385bd879b5c09 Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Wed, 28 Aug 2019 13:08:37 -0700
Subject: [PATCH 5/6] hardcode integration branch

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b5a28bc..b715e83 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -7,6 +7,7 @@ before_script:
   - apt-get -y update
   - apt-get -y install jq
   - pip install -r requirements.txt
+  - export CI_COMMIT_REF_NAME=integration
   - export DEPLOYMENT_ENV=$CI_COMMIT_REF_NAME
   - export SWAGGER_URL="https://dss.$DEPLOYMENT_ENV.data.humancellatlas.org/v1/swagger.json"
   - mkdir -p ~/.config/hca

From 23819713d2f3b85f941ff3846f2399965cd0e172 Mon Sep 17 00:00:00 2001
From: parthshahva
Date: Wed, 28 Aug 2019 13:11:41 -0700
Subject: [PATCH 6/6] freeze openpyxl

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bcc3257..92d7e70 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ iso8601
 requests
 urllib3
 hca>=4.9.0
-openpyxl
+openpyxl==2.3.5
 awscli
 hca-ingest
 cromwell-tools>=1.1.2
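To exercise the new 4000-bundle scale path outside GitLab, a minimal local-run sketch is shown below. The module path and the two environment variables come from the patches above (tests.scale.test_big_bundles and the dcp_wide_ss2_scale_test_4000 job); everything else, such as having the requirements.txt dependencies installed (including the pinned openpyxl==2.3.5), HCA/DCP configuration in ~/.config/hca, and AWS credentials that can reach the org-humancellatlas-dcp-test-data bucket referenced by the ss2_* fixtures, is an assumption about the local environment rather than something these patches set up.

    import os
    import unittest

    # Mirror the variables exported by the dcp_wide_ss2_scale_test_4000 job and
    # the .gitlab-ci.yml before_script; AWS credentials and the installed
    # requirements are assumed to already be present locally.
    os.environ["CI_COMMIT_REF_NAME"] = "integration"
    os.environ["DEPLOYMENT_STAGE"] = "integration"

    # Load and run the single scale test added by this series.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        "tests.scale.test_big_bundles.TestBigBundles.test_one_submission_with_4000_bundles"
    )
    unittest.TextTestRunner(verbosity=2).run(suite)

Setting the environment variables before loading the test module matters, since the fixture reads the deployment values when the test is constructed; any deployment other than integration would need matching fixtures and credentials.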