Merge pull request #602 from splitgraph/bugfix/cli-csv-ingestion

Various fixes for sgr cloud CLIs

mildbyte authored Dec 30, 2021
2 parents dfdb892 + 85ae2c7 commit ef4c785

Showing 13 changed files with 253 additions and 20 deletions.
3 changes: 2 additions & 1 deletion splitgraph/cloud/project/dbt.py
@@ -78,6 +78,7 @@ def generate_dbt_plugin_params(repositories: List[str]) -> Tuple[Dict[str, Any],
# the Git pull URL at action runtime (using GITHUB_TOKEN).
credentials = {"git_url": "$THIS_REPO_URL"}

params = {"sources": [_make_source(r) for r in repositories]}
# Same with the branch: we want to inject the current SHA we're running the action for.
params = {"sources": [_make_source(r) for r in repositories], "git_branch": "$THIS_SHA"}

return params, credentials
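
For context, a minimal sketch (not the real splitgraph module; `_make_source` is replaced with a hypothetical stub) of what `generate_dbt_plugin_params` returns after this change — the `git_branch` placeholder is now emitted alongside the generated dbt sources:

```python
from typing import Any, Dict, List, Tuple


def _make_source_stub(repository: str) -> Dict[str, str]:
    # Hypothetical stand-in for splitgraph's _make_source helper, included only
    # to keep this sketch self-contained.
    return {"dbt_source_name": repository.replace("/", "_"), "repository": repository}


def generate_dbt_plugin_params_sketch(
    repositories: List[str],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    # Both placeholders are substituted at GitHub Actions runtime:
    # $THIS_REPO_URL -> authenticated Git pull URL, $THIS_SHA -> the commit SHA.
    credentials = {"git_url": "$THIS_REPO_URL"}
    params = {
        "sources": [_make_source_stub(r) for r in repositories],
        "git_branch": "$THIS_SHA",
    }
    return params, credentials


print(generate_dbt_plugin_params_sketch(["myns/postgres_fdw"]))
```
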
5 changes: 3 additions & 2 deletions splitgraph/cloud/project/github_actions.py
@@ -30,7 +30,8 @@ def generate_job(
{
"name": "Set up dbt Git URL",
"run": 'echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && '
'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml',
'sed -i "s|\\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g" splitgraph.credentials.yml && '
'sed -i "s|\\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml',
"shell": "bash",
"env": {
"CREDENTIALS_YML": "${{secrets.SPLITGRAPH_CREDENTIALS_YML}}",
@@ -54,7 +55,7 @@ def generate_job(
steps.append(
{
"name": "Run sgr cloud load to set up metadata and data source settings",
"run": "sgr cloud load --remote splitgraph "
"run": "sgr cloud load --remote splitgraph --initial-private "
f"-f splitgraph.yml -f splitgraph.credentials.yml {repository}",
"shell": "bash",
}
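
The generated workflow step now runs a second `sed` to swap `$THIS_SHA` for `$GITHUB_SHA`, and the generated `sgr cloud load` invocation gains `--initial-private` (presumably so repositories created by the load start out private). A rough Python equivalent of the placeholder substitution that the `sed` command performs at action runtime, assuming only the standard `GITHUB_SHA` environment variable:

```python
import os


def substitute_sha_placeholder(splitgraph_yml_text: str) -> str:
    # Equivalent of: sed -i "s|\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml
    # GITHUB_SHA is set by GitHub Actions; the fallback is only for local testing.
    github_sha = os.environ.get("GITHUB_SHA", "0000000000000000000000000000000000000000")
    return splitgraph_yml_text.replace("$THIS_SHA", github_sha)


print(substitute_sha_placeholder("git_branch: $THIS_SHA"))
```
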
114 changes: 113 additions & 1 deletion splitgraph/cloud/project/templates.py
@@ -67,7 +67,7 @@
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
"""

SOURCES_YML_TEMPLATE = """# This file defines all data sources referenced by this model. The mapping
@@ -132,6 +132,118 @@
"SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
your deployment URL if you're on a private deployment).
### Edit `splitgraph.yml`
We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
parameters JSONSchema. You should review it and add suitable plugin settings:
- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
options of the data source (by default, it adds a sample table into the project file)
- change and customize the `metadata` block
- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
and offers a list of alternative subobjects, choose one entry from the list and delete
the list itself, leaving the object at the top level.
Example:
```yaml
- namespace: my_namespace
repository: csv
# Catalog-specific metadata for the repository. Optional.
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
# Data source settings for the repository. Optional.
external:
# Name of the credential that the plugin uses. This can also be a credential_id if the
# credential is already registered on Splitgraph.
credential: csv
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection: # Choose one of:
- connection_type: http # REQUIRED. Constant
url: '' # REQUIRED. HTTP URL to the CSV file
- connection_type: s3 # REQUIRED. Constant
s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
s3_bucket: '' # REQUIRED. Bucket the object is in
s3_region: '' # Region of the S3 bucket
s3_secure: false # Whether to use HTTPS for S3 access
s3_object: '' # Limit the import to a single object
s3_object_prefix: '' # Prefix for object in S3 bucket
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
tables:
sample_table:
# Plugin-specific table parameters matching the plugin's schema
options:
url: '' # HTTP URL to the CSV file
s3_object: '' # S3 object of the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
schema: []
# Whether live querying is enabled for the plugin (creates a "live" tag in the
# repository proxying to the data source). The plugin must support live querying.
is_live: true
# Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
# to trigger ingestion.
schedule:
```
becomes:
```yaml
- namespace: my_namespace
repository: csv
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
external:
# No credential required since we're querying a CSV file over HTTP
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection:
connection_type: http # REQUIRED. Constant
url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Automatically infer table parameters
tables: {}
is_live: true
```
### Set up GitHub Actions
Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
8 changes: 4 additions & 4 deletions splitgraph/commandline/cloud.py
@@ -632,15 +632,15 @@ def load_c(

repo_yaml = load_project(repositories_file)
repositories = repo_yaml.repositories
if limit_repositories:
repositories = [
r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
]

gql_client = GQLAPIClient(remote)

if not skip_external:
rest_client = RESTAPIClient(remote)
if limit_repositories:
repositories = [
r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories
]

filter_credential_names = [
r.external.credential for r in repositories if r.external and r.external.credential
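
The reordering moves the `limit_repositories` filter ahead of the `skip_external` branch, so limiting the load to specific repositories now also applies when external data source setup is skipped. A self-contained sketch of that filter, with a hypothetical `Repo` dataclass standing in for splitgraph's repository model:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Repo:
    # Hypothetical stand-in for the repository entries parsed from splitgraph.yml.
    namespace: str
    repository: str


def filter_repositories(
    repositories: List[Repo], limit_repositories: Optional[List[str]]
) -> List[Repo]:
    # Keep only the repositories explicitly named as namespace/repository.
    if not limit_repositories:
        return repositories
    return [r for r in repositories if f"{r.namespace}/{r.repository}" in limit_repositories]


repos = [Repo("myns", "postgres_fdw"), Repo("myns", "airbyte-postgres")]
print(filter_repositories(repos, ["myns/postgres_fdw"]))
# [Repo(namespace='myns', repository='postgres_fdw')]
```
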
4 changes: 3 additions & 1 deletion splitgraph/commandline/ingestion.py
@@ -142,7 +142,9 @@ def csv_import(
sample = [[str(i) for i in range(len(sample))]] + sample

type_overrides = dict(override_type or [])
sg_schema = infer_sg_schema(sample, override_types=type_overrides, primary_keys=primary_key)
sg_schema = infer_sg_schema(
sample, override_types=type_overrides, primary_keys=primary_key, ignore_empty_strings=False
)
logging.debug("Using Splitgraph schema: %r", sg_schema)

# Reset the stream and pass it to COPY FROM STDIN
9 changes: 6 additions & 3 deletions splitgraph/ingestion/inference.py
@@ -48,12 +48,12 @@ def parse_json(json_s: str):
]


def _infer_column_schema(column_sample: Sequence[str]) -> str:
def _infer_column_schema(column_sample: Sequence[str], ignore_empty_strings=True) -> str:
for candidate, converter in _CONVERTERS:
try:
seen_value = False
for c in column_sample:
if c == "" or c is None:
if (c == "" and ignore_empty_strings) or c is None:
continue

seen_value = True
@@ -73,6 +73,7 @@ def infer_sg_schema(
sample: Sequence[List[str]],
override_types: Optional[Dict[str, str]] = None,
primary_keys: Optional[List[str]] = None,
ignore_empty_strings: bool = True,
):
override_types = override_types or {}
primary_keys = primary_keys or []
@@ -92,7 +93,9 @@
)

for i, (c_name, c_sample) in enumerate(zip(header, columns)):
pg_type = override_types.get(c_name, _infer_column_schema(c_sample))
pg_type = override_types.get(
c_name, _infer_column_schema(c_sample, ignore_empty_strings=ignore_empty_strings)
)

result.append(
TableColumn(
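
Taken together with the `csv_import` change above, passing `ignore_empty_strings=False` makes an empty string count as a real value during inference, so a column mixing numbers and empty strings falls back to a text type instead of, say, integer. A toy reimplementation (not the real `_infer_column_schema`, which uses splitgraph's own converter list) illustrating the effect of the flag:

```python
from typing import Callable, List, Sequence, Tuple


def infer_column_type_sketch(column_sample: Sequence[str], ignore_empty_strings: bool = True) -> str:
    # Simplified candidate list; the real code tries more types (timestamps, JSON, ...).
    candidates: List[Tuple[str, Callable[[str], object]]] = [("integer", int), ("numeric", float)]

    def all_convert(converter: Callable[[str], object]) -> bool:
        for c in column_sample:
            if (c == "" and ignore_empty_strings) or c is None:
                continue
            try:
                converter(c)
            except (ValueError, TypeError):
                return False
        return True

    for type_name, converter in candidates:
        if all_convert(converter):
            return type_name
    return "character varying"  # text fallback


sample = ["1", "2", "", "4"]
print(infer_column_type_sketch(sample, ignore_empty_strings=True))   # integer
print(infer_column_type_sketch(sample, ignore_empty_strings=False))  # character varying
```
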
@@ -30,4 +30,4 @@ models:
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
@@ -18,8 +18,8 @@ jobs:
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
- name: Run sgr cloud load to set up metadata and data source settings
run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
myns/postgres_fdw
run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
-f splitgraph.credentials.yml myns/postgres_fdw
shell: bash
myns_airbyte_postgres:
name: Build myns/airbyte-postgres
@@ -18,8 +18,8 @@ jobs:
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
- name: Run sgr cloud load to set up metadata and data source settings
run: sgr cloud load --remote splitgraph -f splitgraph.yml -f splitgraph.credentials.yml
myns/postgres_fdw
run: sgr cloud load --remote splitgraph --initial-private -f splitgraph.yml
-f splitgraph.credentials.yml myns/postgres_fdw
shell: bash
myns_airbyte_postgres:
name: Build myns/airbyte-postgres
@@ -58,7 +58,7 @@ jobs:
splitgraph_api_secret: ${{ secrets.SPLITGRAPH_API_SECRET }}
- name: Set up dbt Git URL
run: echo "$CREDENTIALS_YML" > splitgraph.credentials.yml && sed -i "s|\$THIS_REPO_URL|https://$GITHUB_ACTOR:$GITHUB_TOKEN@github.com/$GITHUB_REPOSITORY|g"
splitgraph.credentials.yml
splitgraph.credentials.yml && sed -i "s|\$THIS_SHA|$GITHUB_SHA|g" splitgraph.yml
shell: bash
env:
CREDENTIALS_YML: ${{secrets.SPLITGRAPH_CREDENTIALS_YML}}
@@ -39,6 +39,118 @@ repository and create the following secrets:
"SQL credentials"). You can get them at https://www.splitgraph.com/settings/sql-credentials (or
your deployment URL if you're on a private deployment).

### Edit `splitgraph.yml`

We generated a [`splitgraph.yml`](./splitgraph.yml) file from your chosen plugins'
parameters JSONSchema. You should review it and add suitable plugin settings:

- set `tables` to `tables: {}` to let the plugin automatically infer the schema and the
options of the data source (by default, it adds a sample table into the project file)
- change and customize the `metadata` block
- set up the plugin parameters in `external.params`. Where the comment says `CHOOSE ONE`
and offers a list of alternative subobjects, choose one entry from the list and delete
the list itself, leaving the object at the top level.

Example:

```yaml
- namespace: my_namespace
repository: csv
# Catalog-specific metadata for the repository. Optional.
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
# Data source settings for the repository. Optional.
external:
# Name of the credential that the plugin uses. This can also be a credential_id if the
# credential is already registered on Splitgraph.
credential: csv
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection: # Choose one of:
- connection_type: http # REQUIRED. Constant
url: '' # REQUIRED. HTTP URL to the CSV file
- connection_type: s3 # REQUIRED. Constant
s3_endpoint: '' # REQUIRED. S3 endpoint (including port if required)
s3_bucket: '' # REQUIRED. Bucket the object is in
s3_region: '' # Region of the S3 bucket
s3_secure: false # Whether to use HTTPS for S3 access
s3_object: '' # Limit the import to a single object
s3_object_prefix: '' # Prefix for object in S3 bucket
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
tables:
sample_table:
# Plugin-specific table parameters matching the plugin's schema
options:
url: '' # HTTP URL to the CSV file
s3_object: '' # S3 object of the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Schema of the table, a list of objects with `name` and `type`. If set to `[]`, will infer.
schema: []
# Whether live querying is enabled for the plugin (creates a "live" tag in the
# repository proxying to the data source). The plugin must support live querying.
is_live: true
# Ingestion schedule settings. Disable this if you're using GitHub Actions or other methods
# to trigger ingestion.
schedule:
```
becomes:
```yaml
- namespace: my_namespace
repository: csv
metadata:
readme:
text: Readme
description: Description of the repository
topics:
- sample_topic
external:
# No credential required since we're querying a CSV file over HTTP
plugin: csv
# Plugin-specific parameters matching the plugin's parameters schema
params:
connection:
connection_type: http # REQUIRED. Constant
url: 'https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv' # REQUIRED. HTTP URL to the CSV file
autodetect_header: true # Detect whether the CSV file has a header automatically
autodetect_dialect: true # Detect the CSV file's dialect (separator, quoting characters etc) automatically
autodetect_encoding: true # Detect the CSV file's encoding automatically
autodetect_sample_size: 65536 # Sample size, in bytes, for encoding/dialect/header detection
schema_inference_rows: 100000 # Number of rows to use for schema inference
encoding: utf-8 # Encoding of the CSV file
ignore_decode_errors: false # Ignore errors when decoding the file
header: true # First line of the CSV file is its header
delimiter: ',' # Character used to separate fields in the file
quotechar: '"' # Character used to quote fields
# Automatically infer table parameters
tables: {}
is_live: true
```
### Set up GitHub Actions
Because this repository was itself generated by a GitHub Actions job, we can't edit the workflow
@@ -30,4 +30,4 @@ models:
# Here as a starting point. You can reference these models downstream in models that actually
# materialize as tables.
staging:
+materialized: cte
+materialized: ephemeral
@@ -96,6 +96,7 @@ repositories:
namespace: myns
repository: airbyte-postgres
hash_or_tag: latest
git_branch: $THIS_SHA
is_live: false
tables: {}
metadata: