Merge pull request #602 from splitgraph/bugfix/cli-csv-ingestion

Various fixes for sgr cloud CLIs

mildbyte authored Dec 30, 2021
2 parents dfdb892 + 85ae2c7 commit ef4c785

Showing 13 changed files with 253 additions and 20 deletions.
3 changes: 2 additions & 1 deletion splitgraph/cloud/project/dbt.py
@@ -78,6 +78,7 @@ def generate_dbt_plugin_params(repositories: List[str]) -> Tuple[Dict[str, Any],
# the Git pull URL at action runtime (using GITHUB_TOKEN).
credentials = {"git_url": "$THIS_REPO_URL"}

params = {"sources": [_make_source(r) for r in repositories]}
# Same with the branch: we want to inject the current SHA we're running the action for.
params = {"sources": [_make_source(r) for r in repositories], "git_branch": "$THIS_SHA"}

return params, credentials
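
For context, a minimal sketch (not the real splitgraph module; `_make_source` is replaced with a hypothetical stub) of what `generate_dbt_plugin_params` returns after this change — the `git_branch` placeholder is now emitted alongside the generated dbt sources:

```python
from typing import Any, Dict, List, Tuple


def _make_source_stub(repository: str) -> Dict[str, str]:
    # Hypothetical stand-in for splitgraph's _make_source helper, included only
    # to keep this sketch self-contained.
    return {"dbt_source_name": repository.replace("/", "_"), "repository": repository}


def generate_dbt_plugin_params_sketch(
    repositories: List[str],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    # Both placeholders are substituted at GitHub Actions runtime:
    # $THIS_REPO_URL -> authenticated Git pull URL, $THIS_SHA -> the commit SHA.
    credentials = {"git_url": "$THIS_REPO_URL"}
    params = {
        "sources": [_make_source_stub(r) for r in repositories],
        "git_branch": "$THIS_SHA",
    }
    return params, credentials


print(generate_dbt_plugin_params_sketch(["myns/postgres_fdw"]))
```
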
5 changes: 3 additions & 2 deletions splitgraph/cloud/project/github_actions.py
@@ -30,7 +30,8 @@ def generate_job(
{
"name": "Set up dbt Git URL",
"run": 'echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && '
'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml',
'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml && '
'sed -i "s|\\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml',
"shell": "bash",
"env": {
"CREDENTIALS_YML": "${{secrets.SPLITGRAPH_CREDENTIALS_YML}}",
@@ -54,7 +55,7 @@ def generate_job(
steps.append(
{
"name": "Run sgr cloud load to set up metadata and data source settings",
"run": "sgr cloud load --remote splitgraph "
"run": "sgr cloud load --remote splitgraph --initial-private "
f"-f splitgraph.yml -f splitgraph.credentials.yml {repository}",
"shell": "bash",
}
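
The generated workflow step now runs a second `sed` to swap `$THIS_SHA` for `$GITHUB_SHA`, and the generated `sgr cloud load` invocation gains `--initial-private` (presumably so repositories created by the load start out private). A rough Python equivalent of the placeholder substitution that the `sed` command performs at action runtime, assuming only the standard `GITHUB_SHA` environment variable:

```python
import os


def substitute_sha_placeholder(splitgraph_yml_text: str) -> str:
    # Equivalent of: sed -i "s|\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml
    # GITHUB_SHA is set by GitHub Actions; the fallback is only for local testing.
    github_sha = os.environ.get("GITHUB_SHA", "0000000000000000000000000000000000000000")
    return splitgraph_yml_text.replace("$THIS_SHA", github_sha)


print(substitute_sha_placeholder("git_branch: $THIS_SHA"))
```
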
114 changes: 113 additions & 1 deletion splitgraph/cloud/project/templates.py
@@ -67,7 +67,7 @@
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
"""

SOURCES_YML_TEMPLATE = """# This file defines all data sources referenced by this model. The mapping
@@ -132,6 +132,118 @@
"SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
your deployment URL if you're on a private deployment).
### Edit `splitgraph.yml`
We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
parameters JSONSchema. You should review it and add suitable plugin settings:
- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
options of the data source (by default, it adds a sample table into the project file)
- change and customize the `metadata` block
- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
and offers a list of alternative subobjects, choose one entry from the list and delete
the list itself, leaving the object at the top level.
Example:
```yaml
- namespace: my_namespace
repository: csv
# Catalog-specific metadata for the repository. Optional.
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
# Data source settings for the repository. Optional.
external:
# Name of the credential that the plugin uses. This can also be a credential_id if the
# credential is already registered on Splitgraph.
credential: csv
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection: # Choose one of:
- connection_type: http # REQUIRED. Constant
url: '' # REQUIRED. HTTP URL to the CSV file
- connection_type: s3 # REQUIRED. Constant
s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
s3_bucket: '' # REQUIRED. Bucket the object is in
s3_region: '' # Region of the S3 bucket
s3_secure: false # Whether to use HTTPS for S3 access
s3_object: '' # Limit the import to a single object
s3_object_prefix: '' # Prefix for object in S3 bucket
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
tables:
sample_table:
# Plugin-specific table parameters matching the plugin's schema
options:
url: '' # HTTP URL to the CSV file
s3_object: '' # S3 object of the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
schema: []
# Whether live querying is enabled for the plugin (creates a "live" tag in the
# repository proxying to the data source). The plugin must support live querying.
is_live: true
# Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
# to trigger ingestion.
schedule:
```
becomes:
```yaml
- namespace: my_namespace
repository: csv
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
external:
# No credential required since we're querying a CSV file over HTTP
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection:
connection_type: http # REQUIRED. Constant
url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Automatically infer table parameters
tables: {}
is_live: true
```
### Set up GitHub Actions
Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
8 changes: 4 additions & 4 deletions splitgraph/commandline/cloud.py
@@ -632,15 +632,15 @@ def load_c(

repo_yaml = load_project(repositories_file)
repositories = repo_yaml.repositories
if limit_repositories:
repositories = [
r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
]

gql_client = GQLAPIClient(remote)

if not skip_external:
rest_client = RESTAPIClient(remote)
if limit_repositories:
repositories = [
r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
]

filter_credential_names = [
r.external.credential for r in repositories if r.external and r.external.credential
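
The reordering moves the `limit_repositories` filter ahead of the `skip_external` branch, so limiting the load to specific repositories now also applies when external data source setup is skipped. A self-contained sketch of that filter, with a hypothetical `Repo` dataclass standing in for splitgraph's repository model:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Repo:
    # Hypothetical stand-in for the repository entries parsed from splitgraph.yml.
    namespace: str
    repository: str


def filter_repositories(
    repositories: List[Repo], limit_repositories: Optional[List[str]]
) -> List[Repo]:
    # Keep only the repositories explicitly named as namespace/repository.
    if not limit_repositories:
        return repositories
    return [r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories]


repos = [Repo("myns", "postgres_fdw"), Repo("myns", "airbyte-postgres")]
print(filter_repositories(repos, ["myns/postgres_fdw"]))
# [Repo(namespace='myns', repository='postgres_fdw')]
```
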
4 changes: 3 additions & 1 deletion splitgraph/commandline/ingestion.py
@@ -142,7 +142,9 @@ def csv_import(
sample = [[str(i) for i in range(len(sample))]] + sample

type_overrides = dict(override_type or [])
sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
sg_schema = infer_sg_schema(
sample, override_types=type_overrides, primary_keys=primary_key, ignore_empty_strings=False
)
logging.debug("Using Splitgraph schema: %r", sg_schema)

# Reset the stream and pass it to COPY FROM STDIN
9 changes: 6 additions & 3 deletions splitgraph/ingestion/inference.py
@@ -48,12 +48,12 @@ def parse_json(json_s: str):
]


def _infer_column_schema(column_sample: Sequence[str]) -> str:
def _infer_column_schema(column_sample: Sequence[str], ignore_empty_strings=True) -> str:
for candidate, converter in _CONVERTERS:
try:
seen_value = False
for c in column_sample:
if c == "" or c is None:
if (c == "" and ignore_empty_strings) or c is None:
continue

seen_value = True
@@ -73,6 +73,7 @@ def infer_sg_schema(
sample: Sequence[List[str]],
override_types: Optional[Dict[str, str]] = None,
primary_keys: Optional[List[str]] = None,
ignore_empty_strings: bool = True,
):
override_types = override_types or {}
primary_keys = primary_keys or []
@@ -92,7 +93,9 @@
)

for i, (c_name, c_sample) in enumerate(zip(header, columns)):
pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
pg_type = override_types.get(
c_name, _infer_column_schema(c_sample, ignore_empty_strings=ignore_empty_strings)
)

result.append(
TableColumn(
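
Taken together with the `csv_import` change above, passing `ignore_empty_strings=False` makes an empty string count as a real value during inference, so a column mixing numbers and empty strings falls back to a text type instead of, say, integer. A toy reimplementation (not the real `_infer_column_schema`, which uses splitgraph's own converter list) illustrating the effect of the flag:

```python
from typing import Callable, List, Sequence, Tuple


def infer_column_type_sketch(column_sample: Sequence[str], ignore_empty_strings: bool = True) -> str:
    # Simplified candidate list; the real code tries more types (timestamps, JSON, ...).
    candidates: List[Tuple[str, Callable[[str], object]]] = [("integer", int), ("numeric", float)]

    def all_convert(converter: Callable[[str], object]) -> bool:
        for c in column_sample:
            if (c == "" and ignore_empty_strings) or c is None:
                continue
            try:
                converter(c)
            except (ValueError, TypeError):
                return False
        return True

    for type_name, converter in candidates:
        if all_convert(converter):
            return type_name
    return "character varying"  # text fallback


sample = ["1", "2", "", "4"]
print(infer_column_type_sketch(sample, ignore_empty_strings=True))   # integer
print(infer_column_type_sketch(sample, ignore_empty_strings=False))  # character varying
```
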
@@ -30,4 +30,4 @@ models:
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
@@ -18,8 +18,8 @@ jobs:
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
- name: Run sgr cloud load to set up metadata and data source settings
run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
myns/postgres_fdw
run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
-f splitgraph.credentials.yml myns/postgres_fdw
shell: bash
myns_airbyte_postgres:
name: Build myns/airbyte-postgres
@@ -18,8 +18,8 @@ jobs:
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
- name: Run sgr cloud load to set up metadata and data source settings
run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
myns/postgres_fdw
run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
-f splitgraph.credentials.yml myns/postgres_fdw
shell: bash
myns_airbyte_postgres:
name: Build myns/airbyte-postgres
@@ -58,7 +58,7 @@ jobs:
splitgraph_api_secret: ${{ secrets.SPLITGRAPH_API_SECRET }}
- name: Set up dbt Git URL
run: echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && sed -i "s|\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g"
splitgraph.credentials.yml
splitgraph.credentials.yml && sed -i "s|\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml
shell: bash
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
@@ -39,6 +39,118 @@ repository and create the following secrets:
"SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
your deployment URL if you're on a private deployment).

### Edit `splitgraph.yml`

We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
parameters JSONSchema. You should review it and add suitable plugin settings:

- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
options of the data source (by default, it adds a sample table into the project file)
- change and customize the `metadata` block
- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
and offers a list of alternative subobjects, choose one entry from the list and delete
the list itself, leaving the object at the top level.

Example:

```yaml
- namespace: my_namespace
repository: csv
# Catalog-specific metadata for the repository. Optional.
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
# Data source settings for the repository. Optional.
external:
# Name of the credential that the plugin uses. This can also be a credential_id if the
# credential is already registered on Splitgraph.
credential: csv
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection: # Choose one of:
- connection_type: http # REQUIRED. Constant
url: '' # REQUIRED. HTTP URL to the CSV file
- connection_type: s3 # REQUIRED. Constant
s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
s3_bucket: '' # REQUIRED. Bucket the object is in
s3_region: '' # Region of the S3 bucket
s3_secure: false # Whether to use HTTPS for S3 access
s3_object: '' # Limit the import to a single object
s3_object_prefix: '' # Prefix for object in S3 bucket
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
tables:
sample_table:
# Plugin-specific table parameters matching the plugin's schema
options:
url: '' # HTTP URL to the CSV file
s3_object: '' # S3 object of the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
schema: []
# Whether live querying is enabled for the plugin (creates a "live" tag in the
# repository proxying to the data source). The plugin must support live querying.
is_live: true
# Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
# to trigger ingestion.
schedule:
```
becomes:
```yaml
- namespace: my_namespace
repository: csv
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
external:
# No credential required since we're querying a CSV file over HTTP
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection:
connection_type: http # REQUIRED. Constant
url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Automatically infer table parameters
tables: {}
is_live: true
```
### Set up GitHub Actions
Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
@@ -30,4 +30,4 @@ models:
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
@@ -96,6 +96,7 @@ repositories:
namespace: myns
repository: airbyte-postgres
hash_or_tag: latest
git_branch: $THIS_SHA
is_live: false
tables: {}
metadata: