From 2ee73b7459069df1420cf4e9661b31dcd5cb5665 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Fri, 1 Sep 2023 12:07:29 +0300 Subject: [PATCH 01/40] athena-driver --- cli/cmd/runtime/start.go | 1 + go.mod | 13 +- go.sum | 13 + runtime/compilers/rillv1beta/connector.go | 4 + runtime/drivers/athena/athena.go | 303 ++++++++++++++++++ .../duckdb/transporter/filestore_to_duckDB.go | 4 +- .../transporter/objectStore_to_duckDB.go | 4 +- runtime/reconcilers/source.go | 4 + .../catalog/artifacts/yaml/objects.go | 10 + .../catalog/migrator/sources/sources.go | 4 + .../sources/modal/AddSourceModal.svelte | 2 +- .../src/features/sources/modal/yupSchemas.ts | 14 + 12 files changed, 366 insertions(+), 10 deletions(-) create mode 100644 runtime/drivers/athena/athena.go diff --git a/cli/cmd/runtime/start.go b/cli/cmd/runtime/start.go index 6b539e74844..50092647d29 100644 --- a/cli/cmd/runtime/start.go +++ b/cli/cmd/runtime/start.go @@ -23,6 +23,7 @@ import ( "golang.org/x/sync/errgroup" // Load connectors and reconcilers for runtime + _ "github.com/rilldata/rill/runtime/drivers/athena" _ "github.com/rilldata/rill/runtime/drivers/bigquery" _ "github.com/rilldata/rill/runtime/drivers/druid" _ "github.com/rilldata/rill/runtime/drivers/duckdb" diff --git a/go.mod b/go.mod index 5061ba4d75d..d19bfa4eff4 100644 --- a/go.mod +++ b/go.mod @@ -86,7 +86,10 @@ require ( moul.io/zapfilter v1.7.0 ) -require google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect +require ( + github.com/aws/aws-sdk-go-v2/service/athena v1.31.6 // indirect + google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect +) require ( cloud.google.com/go v0.110.2 @@ -105,14 +108,14 @@ require ( github.com/andybalholm/brotli v1.0.5 // indirect github.com/apache/arrow/go/v12 v12.0.0 // indirect github.com/apache/thrift v0.18.1 // indirect - github.com/aws/aws-sdk-go-v2 v1.18.0 // indirect + github.com/aws/aws-sdk-go-v2 v1.21.0 // indirect github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 // indirect github.com/aws/aws-sdk-go-v2/config v1.18.25 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.13.24 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.3 // indirect github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.67 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.3.34 // indirect github.com/aws/aws-sdk-go-v2/internal/v4a v1.0.25 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.11 // indirect @@ -123,7 +126,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/sso v1.12.10 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.10 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.19.0 // indirect - github.com/aws/smithy-go v1.13.5 // indirect + github.com/aws/smithy-go v1.14.2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.2.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect diff --git a/go.sum b/go.sum index d7a794d9f43..d5058389b09 100644 --- a/go.sum +++ b/go.sum @@ -598,6 +598,8 @@ github.com/aws/aws-sdk-go-v2 v1.9.1/go.mod h1:cK/D0BBs0b/oWPIcX/Z/obahJK1TT7IPVj github.com/aws/aws-sdk-go-v2 v1.17.4/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= 
github.com/aws/aws-sdk-go-v2 v1.18.0 h1:882kkTpSFhdgYRKVZ/VCgf7sd0ru57p2JCxz4/oN5RY= github.com/aws/aws-sdk-go-v2 v1.18.0/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= +github.com/aws/aws-sdk-go-v2 v1.21.0 h1:gMT0IW+03wtYJhRqTVYn0wLzwdnK9sRMcxmtfGzRdJc= +github.com/aws/aws-sdk-go-v2 v1.21.0/go.mod h1:/RfNgGmRxI+iFOB1OeJUyxiU+9s88k3pfHvDagGEp0M= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 h1:dK82zF6kkPeCo8J1e+tGx4JdvDIQzj7ygIoLg8WMuGs= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10/go.mod h1:VeTZetY5KRJLuD/7fkQXMU6Mw7H5m/KP2J5Iy9osMno= github.com/aws/aws-sdk-go-v2/config v1.18.12/go.mod h1:J36fOhj1LQBr+O4hJCiT8FwVvieeoSGOtPuvhKlsNu8= @@ -615,15 +617,21 @@ github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.67/go.mod h1:zQClPRIwQZfJl github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.28/go.mod h1:3lwChorpIM/BhImY/hy+Z6jekmN92cXGPI1QJasVPYY= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33 h1:kG5eQilShqmJbv11XL1VpyDbaEJzWxd4zRiCG30GSn4= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33/go.mod h1:7i0PF1ME/2eUPFcjkVIwq+DOygHEoK92t5cDqNgYbIw= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41 h1:22dGT7PneFMx4+b3pz7lMTRyN8ZKH7M2cW4GP9yUS2g= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41/go.mod h1:CrObHAuPneJBlfEJ5T3szXOUkLEThaGfvnhTf33buas= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.22/go.mod h1:EqK7gVrIGAHyZItrD1D8B0ilgwMD1GiWAmbU4u/JHNk= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27 h1:vFQlirhuM8lLlpI7imKOMsjdQLuN9CPi+k44F/OFVsk= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27/go.mod h1:UrHnn3QV/d0pBZ6QBAEQcqFLf8FAzLmoUfPVIueOvoM= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35 h1:SijA0mgjV8E+8G45ltVHs0fvKpTj8xmZJ3VwhGKtUSI= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35/go.mod h1:SJC1nEVVva1g3pHAIdCp7QsRIkMmLAgoDquQ9Rr8kYw= github.com/aws/aws-sdk-go-v2/internal/ini v1.3.29/go.mod h1:TwuqRBGzxjQJIwH16/fOZodwXt2Zxa9/cwJC5ke4j7s= github.com/aws/aws-sdk-go-v2/internal/ini v1.3.34 h1:gGLG7yKaXG02/jBlg210R7VgQIotiQntNhsCFejawx8= github.com/aws/aws-sdk-go-v2/internal/ini v1.3.34/go.mod h1:Etz2dj6UHYuw+Xw830KfzCfWGMzqvUTCjUj5b76GVDc= github.com/aws/aws-sdk-go-v2/internal/v4a v1.0.19/go.mod h1:8W88sW3PjamQpKFUQvHWWKay6ARsNvZnzU7+a4apubw= github.com/aws/aws-sdk-go-v2/internal/v4a v1.0.25 h1:AzwRi5OKKwo4QNqPf7TjeO+tK8AyOK3GVSwmRPo7/Cs= github.com/aws/aws-sdk-go-v2/internal/v4a v1.0.25/go.mod h1:SUbB4wcbSEyCvqBxv/O/IBf93RbEze7U7OnoTlpPB+g= +github.com/aws/aws-sdk-go-v2/service/athena v1.31.6 h1:EFaTu1rBt+KQglDeYRpP1PHot/6xlYzvouxm2aRmrG8= +github.com/aws/aws-sdk-go-v2/service/athena v1.31.6/go.mod h1:DHafyhR8x70ANJZ2RkJx8oeJsfEBqaGwZ591vlihVFQ= github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.8.1/go.mod h1:CM+19rL1+4dFWnOQKwDc7H1KwXTz+h61oUSHyhV0b3o= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.11 h1:y2+VQzC6Zh2ojtV2LoC0MNwHWc6qXv/j2vrQtlftkdA= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.9.11/go.mod h1:iV4q2hsqtNECrfmlXyord9u4zyuFEJX9eLgLpSPzWA8= @@ -656,6 +664,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.19.0/go.mod h1:BgQOMsg8av8jset59jely github.com/aws/smithy-go v1.8.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= github.com/aws/smithy-go v1.13.5/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= +github.com/aws/smithy-go v1.14.2 h1:MJU9hqBGbvWZdApzpvoF2WAIJDbtjK2NDJSiJP7HblQ= +github.com/aws/smithy-go 
v1.14.2/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= github.com/benbjohnson/clock v1.0.3/go.mod h1:bGMdMPoPVvcYyt1gHDf4J2KE153Yf9BuiUKYMaxlTDM= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= @@ -1996,6 +2006,7 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= @@ -3280,7 +3291,9 @@ modernc.org/cc/v3 v3.40.0/go.mod h1:/bTg4dnWkSXowUO6ssQKnOV0yMVxDYNIsIrzqTFDGH0= modernc.org/ccgo/v3 v3.16.13 h1:Mkgdzl46i5F/CNR/Kj80Ri59hC8TKAhZrYSaqvkwzUw= modernc.org/ccgo/v3 v3.16.13/go.mod h1:2Quk+5YgpImhPjv2Qsob1DnZ/4som1lJTodubIcoUkY= modernc.org/ccorpus v1.11.6 h1:J16RXiiqiCgua6+ZvQot4yUuUy8zxgqbqEEUuGPlISk= +modernc.org/ccorpus v1.11.6/go.mod h1:2gEUTrWqdpH2pXsmTM1ZkjeSrUWDpjMu2T6m29L/ErQ= modernc.org/httpfs v1.0.6 h1:AAgIpFZRXuYnkjftxTAZwMIiwEqAfk8aVB2/oA6nAeM= +modernc.org/httpfs v1.0.6/go.mod h1:7dosgurJGp0sPaRanU53W4xZYKh14wfzX420oZADeHM= modernc.org/libc v1.22.6 h1:cbXU8R+A6aOjRuhsFh3nbDWXO/Hs4ClJRXYB11KmPDo= modernc.org/libc v1.22.6/go.mod h1:jj+Z7dTNX8fBScMVNRAYZ/jF91K8fdT2hYMThc3YjBY= modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= diff --git a/runtime/compilers/rillv1beta/connector.go b/runtime/compilers/rillv1beta/connector.go index eeec8f9608f..9be6f3f873b 100644 --- a/runtime/compilers/rillv1beta/connector.go +++ b/runtime/compilers/rillv1beta/connector.go @@ -184,6 +184,10 @@ func source(connector string, src *runtimev1.Source) drivers.Source { return &drivers.DatabaseSource{ Props: props, } + case "athena": + return &drivers.BucketSource{ + Properties: props, + } default: return nil } diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go new file mode 100644 index 00000000000..1f4fa365364 --- /dev/null +++ b/runtime/drivers/athena/athena.go @@ -0,0 +1,303 @@ +package athena + +import ( + "context" + "errors" + "fmt" + "net/http" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/athena" + "github.com/aws/aws-sdk-go-v2/service/athena/types" + s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" + "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/eapache/go-resiliency/retrier" + "github.com/mitchellh/mapstructure" + runtimev1 "github.com/rilldata/rill/proto/gen/rill/runtime/v1" + "github.com/rilldata/rill/runtime/drivers" + rillblob "github.com/rilldata/rill/runtime/drivers/blob" + "go.uber.org/zap" + "gocloud.dev/blob" + "gocloud.dev/blob/s3blob" +) + +const defaultPageSize = 20 + +func init() { + drivers.Register("athena", driver{}) + drivers.RegisterAsConnector("athena", driver{}) +} + +var spec = drivers.Spec{ + DisplayName: "Amazon Athena", + Description: "Connect to Amazon 
Athena database.", + ServiceAccountDocs: "", + SourceProperties: []drivers.PropertySchema{ + { + Key: "sql", + Type: drivers.StringPropertyType, + Required: true, + DisplayName: "SQL", + Description: "Query to extract data from Athena.", + Placeholder: "select * from catalog.table;", + }, + { + Key: "output.location", + DisplayName: "Output location", + Description: "Oputut location for query results in S3.", + Placeholder: "s3://bucket-name/path/", + Type: drivers.StringPropertyType, + Required: true, + }, + { + Key: "profile.name", + DisplayName: "AWS profile", + Description: "AWS profile for credentials.", + Type: drivers.StringPropertyType, + Required: true, + }, + }, + ConfigProperties: []drivers.PropertySchema{}, +} + +type driver struct{} + +type configProperties struct { + // SecretJSON string `mapstructure:"google_application_credentials"` + // AllowHostAccess bool `mapstructure:"allow_host_access"` +} + +func (d driver) Open(config map[string]any, shared bool, logger *zap.Logger) (drivers.Handle, error) { + if shared { + return nil, fmt.Errorf("athena driver can't be shared") + } + conf := &configProperties{} + err := mapstructure.Decode(config, conf) + if err != nil { + return nil, err + } + + conn := &Connection{ + config: conf, + logger: logger, + } + return conn, nil +} + +func (d driver) Drop(config map[string]any, logger *zap.Logger) error { + return drivers.ErrDropNotSupported +} + +func (d driver) Spec() drivers.Spec { + return spec +} + +func (d driver) HasAnonymousSourceAccess(ctx context.Context, src drivers.Source, logger *zap.Logger) (bool, error) { + return false, fmt.Errorf("not implemented") +} + +type sourceProperties struct { + SQL string `mapstructure:"sql"` + OutputLocation string `mapstructure:"output.location"` + ProfileName string `mapstructure:"profile.name"` +} + +func parseSourceProperties(props map[string]any) (*sourceProperties, error) { + fmt.Println(props) + conf := &sourceProperties{} + err := mapstructure.Decode(props, conf) + if err != nil { + return nil, err + } + + return conf, nil +} + +type Connection struct { + config *configProperties + logger *zap.Logger +} + +var _ drivers.Handle = &Connection{} + +// Driver implements drivers.Connection. +func (c *Connection) Driver() string { + return "athena" +} + +// Config implements drivers.Connection. +func (c *Connection) Config() map[string]any { + m := make(map[string]any, 0) + _ = mapstructure.Decode(c.config, m) + return m +} + +// Close implements drivers.Connection. +func (c *Connection) Close() error { + return nil +} + +// Registry implements drivers.Connection. +func (c *Connection) AsRegistry() (drivers.RegistryStore, bool) { + return nil, false +} + +// Catalog implements drivers.Connection. +func (c *Connection) AsCatalogStore(instanceID string) (drivers.CatalogStore, bool) { + return nil, false +} + +// Repo implements drivers.Connection. +func (c *Connection) AsRepoStore(instanceID string) (drivers.RepoStore, bool) { + return nil, false +} + +// OLAP implements drivers.Connection. +func (c *Connection) AsOLAP(instanceID string) (drivers.OLAPStore, bool) { + return nil, false +} + +// Migrate implements drivers.Connection. +func (c *Connection) Migrate(ctx context.Context) (err error) { + return nil +} + +// MigrationStatus implements drivers.Connection. +func (c *Connection) MigrationStatus(ctx context.Context) (current, desired int, err error) { + return 0, 0, nil +} + +// AsObjectStore implements drivers.Connection. 
+func (c *Connection) AsObjectStore() (drivers.ObjectStore, bool) { + return c, true +} + +// AsTransporter implements drivers.Connection. +func (c *Connection) AsTransporter(from, to drivers.Handle) (drivers.Transporter, bool) { + return nil, false +} + +func (c *Connection) AsFileStore() (drivers.FileStore, bool) { + return nil, false +} + +// AsSQLStore implements drivers.Connection. +func (c *Connection) AsSQLStore() (drivers.SQLStore, bool) { + return nil, false +} + +// DownloadFiles returns a file iterator over objects stored in gcs. +// The credential json is read from config google_application_credentials. +// Additionally in case `allow_host_credentials` is true it looks for "Application Default Credentials" as well +func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSource) (drivers.FileIterator, error) { + conf, err := parseSourceProperties(source.Properties) + if err != nil { + return nil, fmt.Errorf("failed to parse config: %w", err) + } + + err = c.unload(ctx, conf) + if err != nil { + return nil, fmt.Errorf("failed to unload: %w", err) + } + + path := conf.OutputLocation + "/parquet_output" + bucketObj, err := c.openBucket(ctx, conf, path) + if err != nil { + return nil, fmt.Errorf("cannot open bucket: %w", err) + } + + opts := rillblob.Options{ + ExtractPolicy: &runtimev1.Source_ExtractPolicy{ + // FilesStrategy: runtimev1.Source_ExtractPolicy_STRATEGY_HEAD, + }, + GlobPattern: "**/*", + } + + it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) + if err != nil { + // TODO :: fix this for single file access. for single file first call only happens during download + var failureErr awserr.RequestFailure + if !errors.As(err, &failureErr) { + return nil, fmt.Errorf("failed to create the iterator %w", err) + } + + // check again + if errors.As(err, &failureErr) && (failureErr.StatusCode() == http.StatusForbidden || failureErr.StatusCode() == http.StatusBadRequest) { + return nil, drivers.NewPermissionDeniedError(fmt.Sprintf("can't access remote err: %v", failureErr)) + } + } + + return it, err +} + +func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { + cfg, err := config.LoadDefaultConfig(context.TODO(), func(o *config.LoadOptions) error { + // o.Region = conf.Region + return nil + }, config.WithSharedConfigProfile(conf.ProfileName)) + if err != nil { + return nil, err + } + + s3client := s3v2.NewFromConfig(cfg) + return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) +} + +func (c *Connection) unload(ctx context.Context, conf *sourceProperties) error { + finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, conf.OutputLocation+"/parquet_output/") // todo create folder + + cfg, err := config.LoadDefaultConfig(context.TODO(), func(o *config.LoadOptions) error { + // o.Region = "us-east-2" + return nil + }, config.WithSharedConfigProfile(conf.ProfileName)) + fmt.Println("Executing : ", conf.ProfileName) + + if err != nil { + return err + } + + client := athena.NewFromConfig(cfg) + + resultConfig := &types.ResultConfiguration{ + OutputLocation: aws.String(conf.OutputLocation + "/output/"), + } + + executeParams := &athena.StartQueryExecutionInput{ + QueryString: aws.String(finalSQL), + ResultConfiguration: resultConfig, + } + + // Start Query Execution + athenaExecution, err := client.StartQueryExecution(ctx, executeParams) + + if err != nil { + return err + } + + // Get Query execution and check for the Query state constantly every 2 second + 
executionID := *athenaExecution.QueryExecutionId + + r := retrier.New(retrier.LimitedExponentialBackoff(10, 100*time.Millisecond, 1*time.Second), nil) // 100 200 400 800 1000 1000 1000 1000 1000 1000 + + return r.Run(func() error { + status, stateErr := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ + QueryExecutionId: &executionID, + }) + + if stateErr != nil { + return stateErr + } + + state := status.QueryExecution.Status.State + + if state == types.QueryExecutionStateSucceeded || state == types.QueryExecutionStateCancelled { + return nil + } else if state == types.QueryExecutionStateFailed { + return fmt.Errorf("Athen query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) + } + return nil + }) +} diff --git a/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go b/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go index c5b8683da5f..e1da3b7ca40 100644 --- a/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go @@ -28,11 +28,11 @@ var _ drivers.Transporter = &fileStoreToDuckDB{} func (t *fileStoreToDuckDB) Transfer(ctx context.Context, source drivers.Source, sink drivers.Sink, opts *drivers.TransferOpts, p drivers.Progress) error { src, ok := source.FileSource() if !ok { - return fmt.Errorf("type of source should `drivers.FilesSource`") + return fmt.Errorf("type of source should be `drivers.FilesSource`") } fSink, ok := sink.DatabaseSink() if !ok { - return fmt.Errorf("type of source should `drivers.DatabaseSink`") + return fmt.Errorf("type of source should be `drivers.DatabaseSink`") } localPaths, err := t.from.FilePaths(ctx, src) diff --git a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go index 06945d4da8a..c96e8bd309b 100644 --- a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go @@ -33,11 +33,11 @@ func NewObjectStoreToDuckDB(from drivers.ObjectStore, to drivers.OLAPStore, logg func (t *objectStoreToDuckDB) Transfer(ctx context.Context, source drivers.Source, sink drivers.Sink, opts *drivers.TransferOpts, p drivers.Progress) error { src, ok := source.BucketSource() if !ok { - return fmt.Errorf("type of source should `drivers.BucketSource`") + return fmt.Errorf("type of source should be `drivers.BucketSource`") } dbSink, ok := sink.DatabaseSink() if !ok { - return fmt.Errorf("type of source should `drivers.DatabaseSink`") + return fmt.Errorf("type of source should be `drivers.DatabaseSink`") } iterator, err := t.from.DownloadFiles(ctx, src) diff --git a/runtime/reconcilers/source.go b/runtime/reconcilers/source.go index 2c8f9632720..8a62c82a0ee 100644 --- a/runtime/reconcilers/source.go +++ b/runtime/reconcilers/source.go @@ -415,6 +415,10 @@ func driversSource(conn drivers.Handle, propsPB *structpb.Struct) (drivers.Sourc SQL: query, Props: props, }, nil + case "athena": + return &drivers.BucketSource{ + Properties: props, + }, nil default: return nil, fmt.Errorf("source connector %q not supported", conn.Driver()) } diff --git a/runtime/services/catalog/artifacts/yaml/objects.go b/runtime/services/catalog/artifacts/yaml/objects.go index a67775089da..867a09d68ae 100644 --- a/runtime/services/catalog/artifacts/yaml/objects.go +++ b/runtime/services/catalog/artifacts/yaml/objects.go @@ -45,6 +45,8 @@ type Source struct { SQL string `yaml:"sql,omitempty" mapstructure:"sql,omitempty"` DB string `yaml:"db,omitempty" 
mapstructure:"db,omitempty"` ProjectID string `yaml:"project_id,omitempty" mapstructure:"project_id,omitempty"` + AthenaOutputLocation string `yaml:"output.location,omitempty" mapstructure:"output.location,omitempty"` + AthenaProfileName string `yaml:"profile.name,omitempty" mapstructure:"profile.name,omitempty"` } type ExtractPolicy struct { @@ -239,6 +241,14 @@ func fromSourceArtifact(source *Source, path string) (*drivers.CatalogEntry, err props["project_id"] = source.ProjectID } + if source.AthenaOutputLocation != "" { + props["output.location"] = source.AthenaOutputLocation + } + + if source.AthenaProfileName != "" { + props["profile.name"] = source.AthenaProfileName + } + propsPB, err := structpb.NewStruct(props) if err != nil { return nil, err diff --git a/runtime/services/catalog/migrator/sources/sources.go b/runtime/services/catalog/migrator/sources/sources.go index f5c32984e92..39a5b0d31d4 100644 --- a/runtime/services/catalog/migrator/sources/sources.go +++ b/runtime/services/catalog/migrator/sources/sources.go @@ -402,6 +402,10 @@ func source(connector string, src *runtimev1.Source) (drivers.Source, error) { SQL: query, Props: props, }, nil + case "athena": + return &drivers.BucketSource{ + Properties: props, + }, nil default: return nil, fmt.Errorf("connector %v not supported", connector) } diff --git a/web-common/src/features/sources/modal/AddSourceModal.svelte b/web-common/src/features/sources/modal/AddSourceModal.svelte index d85e7ba84d3..2d566b9118d 100644 --- a/web-common/src/features/sources/modal/AddSourceModal.svelte +++ b/web-common/src/features/sources/modal/AddSourceModal.svelte @@ -91,7 +91,7 @@ </TabGroup> </div> <div class="flex-grow overflow-y-auto"> - {#if selectedConnector?.name === "gcs" || selectedConnector?.name === "s3" || selectedConnector?.name === "https" || selectedConnector?.name === "motherduck" || selectedConnector?.name === "bigquery"} + {#if selectedConnector?.name === "gcs" || selectedConnector?.name === "s3" || selectedConnector?.name === "https" || selectedConnector?.name === "motherduck" || selectedConnector?.name === "bigquery" || selectedConnector?.name === "athena"} {#key selectedConnector} <RemoteSourceForm connector={selectedConnector} on:close /> {/key} diff --git a/web-common/src/features/sources/modal/yupSchemas.ts b/web-common/src/features/sources/modal/yupSchemas.ts index 3eb7b57be8e..45412cd33a2 100644 --- a/web-common/src/features/sources/modal/yupSchemas.ts +++ b/web-common/src/features/sources/modal/yupSchemas.ts @@ -69,6 +69,20 @@ export function getYupSchema(connector: V1ConnectorSpec) { .required("Source name is required"), project_id: yup.string().required("project_id is required"), }); + case "athena": + return yup.object().shape({ + sql: yup.string().required("sql is required"), + sourceName: yup + .string() + .matches( + /^[a-zA-Z_][a-zA-Z0-9_]*$/, + "Source name must start with a letter or underscore and contain only letters, numbers, and underscores" + ) + .required("Source name is required"), + output_location: yup.string().required(), + profile_name: yup.string().required(), + }); + default: throw new Error(`Unknown connector: ${connector.name}`); } From 328f2aee561ae40421a08344fb37d647752c8c30 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Fri, 1 Sep 2023 16:26:50 +0300 Subject: [PATCH 02/40] athena-driver --- runtime/drivers/athena/athena.go | 38 +++++++++---------- .../duckdb/transporter/filestore_to_duckDB.go | 2 +- .../transporter/objectStore_to_duckDB.go | 18 ++++++--- 
runtime/drivers/duckdb/transporter/utils.go | 4 +- 4 files changed, 34 insertions(+), 28 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 1f4fa365364..d918701e26c 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "net/http" + "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -15,6 +16,7 @@ import ( s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/eapache/go-resiliency/retrier" + "github.com/google/uuid" "github.com/mitchellh/mapstructure" runtimev1 "github.com/rilldata/rill/proto/gen/rill/runtime/v1" "github.com/rilldata/rill/runtime/drivers" @@ -48,7 +50,7 @@ var spec = drivers.Spec{ Key: "output.location", DisplayName: "Output location", Description: "Oputut location for query results in S3.", - Placeholder: "s3://bucket-name/path/", + Placeholder: "bucket-name", Type: drivers.StringPropertyType, Required: true, }, @@ -106,7 +108,6 @@ type sourceProperties struct { } func parseSourceProperties(props map[string]any) (*sourceProperties, error) { - fmt.Println(props) conf := &sourceProperties{} err := mapstructure.Decode(props, conf) if err != nil { @@ -198,22 +199,24 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return nil, fmt.Errorf("failed to parse config: %w", err) } - err = c.unload(ctx, conf) + prefix := "parquet_output_" + uuid.New().String() + bucketName := strings.TrimPrefix(strings.TrimRight(conf.OutputLocation, "/"), "s3://") + unloadPath := bucketName + "/" + prefix + err = c.unload(ctx, conf, "s3://"+unloadPath) if err != nil { return nil, fmt.Errorf("failed to unload: %w", err) } - path := conf.OutputLocation + "/parquet_output" - bucketObj, err := c.openBucket(ctx, conf, path) + bucketObj, err := c.openBucket(ctx, conf, bucketName) if err != nil { - return nil, fmt.Errorf("cannot open bucket: %w", err) + return nil, fmt.Errorf("cannot open bucket %q: %w", unloadPath, err) } opts := rillblob.Options{ ExtractPolicy: &runtimev1.Source_ExtractPolicy{ // FilesStrategy: runtimev1.Source_ExtractPolicy_STRATEGY_HEAD, }, - GlobPattern: "**/*", + GlobPattern: prefix + "/**", } it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) @@ -221,7 +224,7 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo // TODO :: fix this for single file access. 
for single file first call only happens during download var failureErr awserr.RequestFailure if !errors.As(err, &failureErr) { - return nil, fmt.Errorf("failed to create the iterator %w", err) + return nil, fmt.Errorf("failed to create the iterator %q %w", unloadPath, err) } // check again @@ -246,15 +249,10 @@ func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, buc return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) } -func (c *Connection) unload(ctx context.Context, conf *sourceProperties) error { - finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, conf.OutputLocation+"/parquet_output/") // todo create folder - - cfg, err := config.LoadDefaultConfig(context.TODO(), func(o *config.LoadOptions) error { - // o.Region = "us-east-2" - return nil - }, config.WithSharedConfigProfile(conf.ProfileName)) - fmt.Println("Executing : ", conf.ProfileName) +func (c *Connection) unload(ctx context.Context, conf *sourceProperties, path string) error { + finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, path) + cfg, err := config.LoadDefaultConfig(context.TODO(), config.WithSharedConfigProfile(conf.ProfileName)) if err != nil { return err } @@ -262,7 +260,7 @@ func (c *Connection) unload(ctx context.Context, conf *sourceProperties) error { client := athena.NewFromConfig(cfg) resultConfig := &types.ResultConfiguration{ - OutputLocation: aws.String(conf.OutputLocation + "/output/"), + OutputLocation: aws.String("s3://" + strings.TrimPrefix(strings.TrimRight(conf.OutputLocation, "/"), "s3://") + "/output/"), } executeParams := &athena.StartQueryExecutionInput{ @@ -280,7 +278,7 @@ func (c *Connection) unload(ctx context.Context, conf *sourceProperties) error { // Get Query execution and check for the Query state constantly every 2 second executionID := *athenaExecution.QueryExecutionId - r := retrier.New(retrier.LimitedExponentialBackoff(10, 100*time.Millisecond, 1*time.Second), nil) // 100 200 400 800 1000 1000 1000 1000 1000 1000 + r := retrier.New(retrier.LimitedExponentialBackoff(20, 100*time.Millisecond, 1*time.Second), nil) // 100 200 400 800 1000 1000 1000 1000 1000 1000 ... 
< 20 sec return r.Run(func() error { status, stateErr := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ @@ -296,8 +294,8 @@ func (c *Connection) unload(ctx context.Context, conf *sourceProperties) error { if state == types.QueryExecutionStateSucceeded || state == types.QueryExecutionStateCancelled { return nil } else if state == types.QueryExecutionStateFailed { - return fmt.Errorf("Athen query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) + return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) } - return nil + return fmt.Errorf("Execution is not completed yet, current state: %s", state) }) } diff --git a/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go b/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go index e1da3b7ca40..866e2d0e7d7 100644 --- a/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go @@ -65,7 +65,7 @@ func (t *fileStoreToDuckDB) Transfer(ctx context.Context, source drivers.Source, } // Ingest data - from, err := sourceReader(localPaths, format, ingestionProps) + from, err := sourceReader(localPaths, format, ingestionProps, false) if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go index c96e8bd309b..3d77130dcf2 100644 --- a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go @@ -4,10 +4,12 @@ import ( "context" "errors" "fmt" + "reflect" "strings" "time" "github.com/rilldata/rill/runtime/drivers" + "github.com/rilldata/rill/runtime/drivers/athena" "github.com/rilldata/rill/runtime/pkg/duckdbsql" "github.com/rilldata/rill/runtime/pkg/fileutil" "github.com/rilldata/rill/runtime/pkg/observability" @@ -51,9 +53,10 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, source drivers.Sourc return drivers.ErrIngestionLimitExceeded } + fromAthena := reflect.TypeOf(t.from).AssignableTo(reflect.TypeOf(&athena.Connection{})) sql, hasSQL := src.Properties["sql"].(string) // if sql is specified use ast rewrite to fill in the downloaded files - if hasSQL { + if hasSQL && !fromAthena { return t.ingestDuckDBSQL(ctx, sql, iterator, dbSink, opts, p) } @@ -63,6 +66,9 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, source drivers.Sourc val, formatDefined := src.Properties["format"].(string) if formatDefined { format = fmt.Sprintf(".%s", val) + } else if fromAthena { + format = "parquet" + formatDefined = true } allowSchemaRelaxation, err := schemaRelaxationProperty(src.Properties) @@ -97,11 +103,13 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, source drivers.Sourc st := time.Now() t.logger.Info("ingesting files", zap.Strings("files", files), observability.ZapCtx(ctx)) if appendToTable { - if err := a.appendData(ctx, files, format); err != nil { + if err := a.appendData(ctx, files, format, fromAthena); err != nil { return err } } else { - from, err := sourceReader(files, format, ingestionProps) + var from string + var err error + from, err = sourceReader(files, format, ingestionProps, fromAthena) if err != nil { return err } @@ -142,8 +150,8 @@ func newAppender(to drivers.OLAPStore, sink *drivers.DatabaseSink, ingestionProp } } -func (a *appender) appendData(ctx context.Context, files []string, format string) error { - from, err := sourceReader(files, format, a.ingestionProps) +func (a 
*appender) appendData(ctx context.Context, files []string, format string, fromAthena bool) error { + from, err := sourceReader(files, format, a.ingestionProps, fromAthena) if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/utils.go b/runtime/drivers/duckdb/transporter/utils.go index c6f9851b71e..3a3aee13233 100644 --- a/runtime/drivers/duckdb/transporter/utils.go +++ b/runtime/drivers/duckdb/transporter/utils.go @@ -27,12 +27,12 @@ func rawConn(conn *sql.Conn, f func(driver.Conn) error) error { }) } -func sourceReader(paths []string, format string, ingestionProps map[string]any) (string, error) { +func sourceReader(paths []string, format string, ingestionProps map[string]any, fromAthena bool) (string, error) { // Generate a "read" statement if containsAny(format, []string{".csv", ".tsv", ".txt"}) { // CSV reader return generateReadCsvStatement(paths, ingestionProps) - } else if strings.Contains(format, ".parquet") { + } else if strings.Contains(format, ".parquet") || fromAthena { // Parquet reader return generateReadParquetStatement(paths, ingestionProps) } else if containsAny(format, []string{".json", ".ndjson"}) { From a047f82cbbfd19043d82803599169f4b28ad659e Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Mon, 4 Sep 2023 12:22:13 +0300 Subject: [PATCH 03/40] athena-driver --- runtime/drivers/athena/athena.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index d918701e26c..c4f2f1620fa 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -10,7 +10,7 @@ import ( "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/config" + awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/athena" "github.com/aws/aws-sdk-go-v2/service/athena/types" s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" @@ -26,8 +26,6 @@ import ( "gocloud.dev/blob/s3blob" ) -const defaultPageSize = 20 - func init() { drivers.Register("athena", driver{}) drivers.RegisterAsConnector("athena", driver{}) @@ -237,10 +235,7 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo } func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { - cfg, err := config.LoadDefaultConfig(context.TODO(), func(o *config.LoadOptions) error { - // o.Region = conf.Region - return nil - }, config.WithSharedConfigProfile(conf.ProfileName)) + cfg, err := awsconfig.LoadDefaultConfig(context.TODO(), awsconfig.WithSharedConfigProfile(conf.ProfileName)) if err != nil { return nil, err } @@ -252,7 +247,7 @@ func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, buc func (c *Connection) unload(ctx context.Context, conf *sourceProperties, path string) error { finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, path) - cfg, err := config.LoadDefaultConfig(context.TODO(), config.WithSharedConfigProfile(conf.ProfileName)) + cfg, err := awsconfig.LoadDefaultConfig(context.TODO(), awsconfig.WithSharedConfigProfile(conf.ProfileName)) if err != nil { return err } From dbba228ce80b1c6d3ce4448afb8facdddfa16cd7 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Mon, 4 Sep 2023 12:50:01 +0300 Subject: [PATCH 04/40] athena-driver --- runtime/drivers/athena/athena.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/runtime/drivers/athena/athena.go 
b/runtime/drivers/athena/athena.go index c4f2f1620fa..1b2c95d42c2 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -9,7 +9,6 @@ import ( "time" "github.com/aws/aws-sdk-go-v2/aws" - awsconfig "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go-v2/service/athena" "github.com/aws/aws-sdk-go-v2/service/athena/types" @@ -265,7 +264,6 @@ func (c *Connection) unload(ctx context.Context, conf *sourceProperties, path st // Start Query Execution athenaExecution, err := client.StartQueryExecution(ctx, executeParams) - if err != nil { return err } From 985f1955489f8a527de2a2a5287f1936b6e9891f Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 5 Sep 2023 16:01:07 +0300 Subject: [PATCH 05/40] athena-driver review --- runtime/drivers/athena/athena.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 1b2c95d42c2..e65e860d61f 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -262,24 +262,20 @@ func (c *Connection) unload(ctx context.Context, conf *sourceProperties, path st ResultConfiguration: resultConfig, } - // Start Query Execution athenaExecution, err := client.StartQueryExecution(ctx, executeParams) if err != nil { return err } - // Get Query execution and check for the Query state constantly every 2 second - executionID := *athenaExecution.QueryExecutionId + r := retrier.New(retrier.ConstantBackoff(20, 1*time.Second), nil) - r := retrier.New(retrier.LimitedExponentialBackoff(20, 100*time.Millisecond, 1*time.Second), nil) // 100 200 400 800 1000 1000 1000 1000 1000 1000 ... < 20 sec - - return r.Run(func() error { - status, stateErr := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ - QueryExecutionId: &executionID, + return r.RunCtx(ctx, func(ctx context.Context) error { + status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ + QueryExecutionId: athenaExecution.QueryExecutionId, }) - if stateErr != nil { - return stateErr + if err != nil { + return err } state := status.QueryExecution.Status.State From 3db7655b976a7b7040cc131c5617261c1c8a5996 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 5 Sep 2023 18:48:43 +0300 Subject: [PATCH 06/40] athena-driver review --- runtime/drivers/athena/athena.go | 63 ++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index e65e860d61f..e47c480d52c 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -13,6 +13,8 @@ import ( "github.com/aws/aws-sdk-go-v2/service/athena" "github.com/aws/aws-sdk-go-v2/service/athena/types" s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" + s3v2types "github.com/aws/aws-sdk-go-v2/service/s3/types" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/eapache/go-resiliency/retrier" "github.com/google/uuid" @@ -187,46 +189,68 @@ func (c *Connection) AsSQLStore() (drivers.SQLStore, bool) { return nil, false } -// DownloadFiles returns a file iterator over objects stored in gcs. -// The credential json is read from config google_application_credentials. 
-// Additionally in case `allow_host_credentials` is true it looks for "Application Default Credentials" as well +func cleanPath(ctx context.Context, cfg aws.Config, bucketName, prefix string) error { + s3client := s3v2.NewFromConfig(cfg) + out, err := s3client.ListObjectsV2(ctx, &s3v2.ListObjectsV2Input{ + Bucket: &bucketName, + Prefix: &prefix, + }) + if err != nil { + return err + } + + ids := make([]s3v2types.ObjectIdentifier, 0, len(out.Contents)) + for _, o := range out.Contents { + ids = append(ids, s3v2types.ObjectIdentifier{ + Key: o.Key, + }) + } + _, err = s3client.DeleteObjects(ctx, &s3v2.DeleteObjectsInput{ + Delete: &s3v2types.Delete{ + Objects: ids, + }, + }) + return err +} + func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSource) (drivers.FileIterator, error) { conf, err := parseSourceProperties(source.Properties) if err != nil { return nil, fmt.Errorf("failed to parse config: %w", err) } + cfg, err := awsconfig.LoadDefaultConfig(ctx, awsconfig.WithSharedConfigProfile(conf.ProfileName)) + if err != nil { + return nil, err + } + prefix := "parquet_output_" + uuid.New().String() bucketName := strings.TrimPrefix(strings.TrimRight(conf.OutputLocation, "/"), "s3://") unloadPath := bucketName + "/" + prefix - err = c.unload(ctx, conf, "s3://"+unloadPath) + err = c.unload(ctx, cfg, conf, "s3://"+unloadPath) if err != nil { - return nil, fmt.Errorf("failed to unload: %w", err) + return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, prefix)) } bucketObj, err := c.openBucket(ctx, conf, bucketName) if err != nil { - return nil, fmt.Errorf("cannot open bucket %q: %w", unloadPath, err) + return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", unloadPath, err), cleanPath(ctx, cfg, bucketName, prefix)) } opts := rillblob.Options{ - ExtractPolicy: &runtimev1.Source_ExtractPolicy{ - // FilesStrategy: runtimev1.Source_ExtractPolicy_STRATEGY_HEAD, - }, - GlobPattern: prefix + "/**", + ExtractPolicy: &runtimev1.Source_ExtractPolicy{}, + GlobPattern: prefix + "/**", } it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) if err != nil { - // TODO :: fix this for single file access. 
for single file first call only happens during download var failureErr awserr.RequestFailure if !errors.As(err, &failureErr) { - return nil, fmt.Errorf("failed to create the iterator %q %w", unloadPath, err) + return nil, errors.Join(fmt.Errorf("failed to create the iterator %q %w", unloadPath, err), cleanPath(ctx, cfg, bucketName, prefix)) } - // check again if errors.As(err, &failureErr) && (failureErr.StatusCode() == http.StatusForbidden || failureErr.StatusCode() == http.StatusBadRequest) { - return nil, drivers.NewPermissionDeniedError(fmt.Sprintf("can't access remote err: %v", failureErr)) + return nil, errors.Join(drivers.NewPermissionDeniedError(fmt.Sprintf("can't access remote err: %v", failureErr)), cleanPath(ctx, cfg, bucketName, prefix)) } } @@ -234,7 +258,7 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo } func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { - cfg, err := awsconfig.LoadDefaultConfig(context.TODO(), awsconfig.WithSharedConfigProfile(conf.ProfileName)) + cfg, err := awsconfig.LoadDefaultConfig(ctx, awsconfig.WithSharedConfigProfile(conf.ProfileName)) if err != nil { return nil, err } @@ -243,16 +267,9 @@ func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, buc return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) } -func (c *Connection) unload(ctx context.Context, conf *sourceProperties, path string) error { +func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourceProperties, path string) error { finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, path) - - cfg, err := awsconfig.LoadDefaultConfig(context.TODO(), awsconfig.WithSharedConfigProfile(conf.ProfileName)) - if err != nil { - return err - } - client := athena.NewFromConfig(cfg) - resultConfig := &types.ResultConfiguration{ OutputLocation: aws.String("s3://" + strings.TrimPrefix(strings.TrimRight(conf.OutputLocation, "/"), "s3://") + "/output/"), } From 5c66737b2211bb96289c98386523be75a9658b21 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 5 Sep 2023 18:51:26 +0300 Subject: [PATCH 07/40] athena-driver review --- runtime/drivers/athena/athena.go | 8 ++++---- runtime/services/catalog/artifacts/yaml/objects.go | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index e47c480d52c..50db075f588 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -46,7 +46,7 @@ var spec = drivers.Spec{ Placeholder: "select * from catalog.table;", }, { - Key: "output.location", + Key: "output_location", DisplayName: "Output location", Description: "Oputut location for query results in S3.", Placeholder: "bucket-name", @@ -54,7 +54,7 @@ var spec = drivers.Spec{ Required: true, }, { - Key: "profile.name", + Key: "profile_name", DisplayName: "AWS profile", Description: "AWS profile for credentials.", Type: drivers.StringPropertyType, @@ -102,8 +102,8 @@ func (d driver) HasAnonymousSourceAccess(ctx context.Context, src drivers.Source type sourceProperties struct { SQL string `mapstructure:"sql"` - OutputLocation string `mapstructure:"output.location"` - ProfileName string `mapstructure:"profile.name"` + OutputLocation string `mapstructure:"output_location"` + ProfileName string `mapstructure:"profile_name"` } func parseSourceProperties(props map[string]any) (*sourceProperties, error) { diff --git 
a/runtime/services/catalog/artifacts/yaml/objects.go b/runtime/services/catalog/artifacts/yaml/objects.go index 867a09d68ae..fafaa0566ab 100644 --- a/runtime/services/catalog/artifacts/yaml/objects.go +++ b/runtime/services/catalog/artifacts/yaml/objects.go @@ -45,8 +45,8 @@ type Source struct { SQL string `yaml:"sql,omitempty" mapstructure:"sql,omitempty"` DB string `yaml:"db,omitempty" mapstructure:"db,omitempty"` ProjectID string `yaml:"project_id,omitempty" mapstructure:"project_id,omitempty"` - AthenaOutputLocation string `yaml:"output.location,omitempty" mapstructure:"output.location,omitempty"` - AthenaProfileName string `yaml:"profile.name,omitempty" mapstructure:"profile.name,omitempty"` + AthenaOutputLocation string `yaml:"output_location,omitempty" mapstructure:"output_location,omitempty"` + AthenaProfileName string `yaml:"profile_name,omitempty" mapstructure:"profile_name,omitempty"` } type ExtractPolicy struct { @@ -242,11 +242,11 @@ func fromSourceArtifact(source *Source, path string) (*drivers.CatalogEntry, err } if source.AthenaOutputLocation != "" { - props["output.location"] = source.AthenaOutputLocation + props["output_location"] = source.AthenaOutputLocation } if source.AthenaProfileName != "" { - props["profile.name"] = source.AthenaProfileName + props["profile_name"] = source.AthenaProfileName } propsPB, err := structpb.NewStruct(props) From 605b7911c1126c25ad53191e9478b6278cd9211b Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 5 Sep 2023 18:55:44 +0300 Subject: [PATCH 08/40] athena-driver review --- runtime/drivers/athena/athena.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 50db075f588..f8f4bf194d8 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -238,8 +238,7 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo } opts := rillblob.Options{ - ExtractPolicy: &runtimev1.Source_ExtractPolicy{}, - GlobPattern: prefix + "/**", + GlobPattern: prefix + "/**", } it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) From 200815505e7174578bdbbac76cc5c2b2c0fbbef1 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 5 Sep 2023 20:34:49 +0300 Subject: [PATCH 09/40] athena-driver review --- runtime/drivers/athena/athena.go | 40 ++++++++++++++----- .../catalog/artifacts/yaml/objects.go | 5 --- .../catalog/migrator/sources/sources.go | 4 ++ .../src/features/sources/modal/yupSchemas.ts | 2 +- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index f8f4bf194d8..7e8c0cb8f64 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -10,6 +10,7 @@ import ( "github.com/aws/aws-sdk-go-v2/aws" awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials" "github.com/aws/aws-sdk-go-v2/service/athena" "github.com/aws/aws-sdk-go-v2/service/athena/types" s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" @@ -19,7 +20,6 @@ import ( "github.com/eapache/go-resiliency/retrier" "github.com/google/uuid" "github.com/mitchellh/mapstructure" - runtimev1 "github.com/rilldata/rill/proto/gen/rill/runtime/v1" "github.com/rilldata/rill/runtime/drivers" rillblob "github.com/rilldata/rill/runtime/drivers/blob" "go.uber.org/zap" @@ -47,28 +47,38 @@ var spec = drivers.Spec{ }, { Key: "output_location", - DisplayName: "Output location", 
+ DisplayName: "S3 output location", Description: "Oputut location for query results in S3.", - Placeholder: "bucket-name", + Placeholder: "mybucket", Type: drivers.StringPropertyType, Required: true, }, { - Key: "profile_name", - DisplayName: "AWS profile", + Key: "region", + DisplayName: "AWS region", Description: "AWS profile for credentials.", Type: drivers.StringPropertyType, Required: true, }, }, - ConfigProperties: []drivers.PropertySchema{}, + ConfigProperties: []drivers.PropertySchema{ + { + Key: "aws_access_key_id", + Secret: true, + }, + { + Key: "aws_secret_access_key", + Secret: true, + }, + }, } type driver struct{} type configProperties struct { - // SecretJSON string `mapstructure:"google_application_credentials"` - // AllowHostAccess bool `mapstructure:"allow_host_access"` + AccessKeyID string `mapstructure:"aws_access_key_id"` + SecretAccessKey string `mapstructure:"aws_secret_access_key"` + SessionToken string `mapstructure:"aws_access_token"` } func (d driver) Open(config map[string]any, shared bool, logger *zap.Logger) (drivers.Handle, error) { @@ -103,7 +113,7 @@ func (d driver) HasAnonymousSourceAccess(ctx context.Context, src drivers.Source type sourceProperties struct { SQL string `mapstructure:"sql"` OutputLocation string `mapstructure:"output_location"` - ProfileName string `mapstructure:"profile_name"` + Region string `mapstructure:"region"` } func parseSourceProperties(props map[string]any) (*sourceProperties, error) { @@ -219,7 +229,11 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return nil, fmt.Errorf("failed to parse config: %w", err) } - cfg, err := awsconfig.LoadDefaultConfig(ctx, awsconfig.WithSharedConfigProfile(conf.ProfileName)) + cfg, err := awsconfig.LoadDefaultConfig( + ctx, + awsconfig.WithRegion(conf.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)) + ) if err != nil { return nil, err } @@ -257,7 +271,11 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo } func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { - cfg, err := awsconfig.LoadDefaultConfig(ctx, awsconfig.WithSharedConfigProfile(conf.ProfileName)) + cfg, err := awsconfig.LoadDefaultConfig( + ctx, + awsconfig.WithRegion(conf.Region), + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)) + ) if err != nil { return nil, err } diff --git a/runtime/services/catalog/artifacts/yaml/objects.go b/runtime/services/catalog/artifacts/yaml/objects.go index fafaa0566ab..271d9541d19 100644 --- a/runtime/services/catalog/artifacts/yaml/objects.go +++ b/runtime/services/catalog/artifacts/yaml/objects.go @@ -46,7 +46,6 @@ type Source struct { DB string `yaml:"db,omitempty" mapstructure:"db,omitempty"` ProjectID string `yaml:"project_id,omitempty" mapstructure:"project_id,omitempty"` AthenaOutputLocation string `yaml:"output_location,omitempty" mapstructure:"output_location,omitempty"` - AthenaProfileName string `yaml:"profile_name,omitempty" mapstructure:"profile_name,omitempty"` } type ExtractPolicy struct { @@ -245,10 +244,6 @@ func fromSourceArtifact(source *Source, path string) (*drivers.CatalogEntry, err props["output_location"] = source.AthenaOutputLocation } - if source.AthenaProfileName != "" { - props["profile_name"] = source.AthenaProfileName - } - propsPB, err := 
structpb.NewStruct(props) if err != nil { return nil, err diff --git a/runtime/services/catalog/migrator/sources/sources.go b/runtime/services/catalog/migrator/sources/sources.go index 39a5b0d31d4..b171ea305d5 100644 --- a/runtime/services/catalog/migrator/sources/sources.go +++ b/runtime/services/catalog/migrator/sources/sources.go @@ -432,6 +432,10 @@ func connectorVariables(src *runtimev1.Source, env map[string]string, repoRoot s vars["aws_access_key_id"] = env["aws_access_key_id"] vars["aws_secret_access_key"] = env["aws_secret_access_key"] vars["aws_session_token"] = env["aws_session_token"] + case "athena": + vars["aws_access_key_id"] = env["aws_access_key_id"] + vars["aws_secret_access_key"] = env["aws_secret_access_key"] + vars["aws_session_token"] = env["aws_session_token"] case "gcs": vars["google_application_credentials"] = env["google_application_credentials"] case "motherduck": diff --git a/web-common/src/features/sources/modal/yupSchemas.ts b/web-common/src/features/sources/modal/yupSchemas.ts index 45412cd33a2..3ef6d357edb 100644 --- a/web-common/src/features/sources/modal/yupSchemas.ts +++ b/web-common/src/features/sources/modal/yupSchemas.ts @@ -80,7 +80,7 @@ export function getYupSchema(connector: V1ConnectorSpec) { ) .required("Source name is required"), output_location: yup.string().required(), - profile_name: yup.string().required(), + region: yup.string(), }); default: From 00d3ec05c81acd560c9cc61ce9735ff588edafce Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 5 Sep 2023 20:48:23 +0300 Subject: [PATCH 10/40] athena-driver review --- runtime/drivers/athena/athena.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 7e8c0cb8f64..d889d3a9f00 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -257,17 +257,10 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) if err != nil { - var failureErr awserr.RequestFailure - if !errors.As(err, &failureErr) { - return nil, errors.Join(fmt.Errorf("failed to create the iterator %q %w", unloadPath, err), cleanPath(ctx, cfg, bucketName, prefix)) - } - - if errors.As(err, &failureErr) && (failureErr.StatusCode() == http.StatusForbidden || failureErr.StatusCode() == http.StatusBadRequest) { - return nil, errors.Join(drivers.NewPermissionDeniedError(fmt.Sprintf("can't access remote err: %v", failureErr)), cleanPath(ctx, cfg, bucketName, prefix)) - } + return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, prefix)) } - return it, err + return it } func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { From dc520560cc97223c92160c5cab71c9a03287234a Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 6 Sep 2023 10:59:19 +0300 Subject: [PATCH 11/40] athena-driver review --- runtime/drivers/athena/athena.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index d889d3a9f00..cabc72c20a2 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -113,7 +113,7 @@ func (d driver) HasAnonymousSourceAccess(ctx context.Context, src drivers.Source type sourceProperties struct { SQL string 
`mapstructure:"sql"` OutputLocation string `mapstructure:"output_location"` - Region string `mapstructure:"region"` + Region string `mapstructure:"region"` } func parseSourceProperties(props map[string]any) (*sourceProperties, error) { @@ -230,9 +230,9 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo } cfg, err := awsconfig.LoadDefaultConfig( - ctx, + ctx, awsconfig.WithRegion(conf.Region), - awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)) + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)), ) if err != nil { return nil, err @@ -260,14 +260,14 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, prefix)) } - return it + return it, nil } func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { cfg, err := awsconfig.LoadDefaultConfig( - ctx, + ctx, awsconfig.WithRegion(conf.Region), - awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)) + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)), ) if err != nil { return nil, err From d7e774b7a0f58050fafe4edfa40ff9619b4961a2 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 6 Sep 2023 12:08:27 +0300 Subject: [PATCH 12/40] athena-driver review --- runtime/drivers/athena/athena.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index cabc72c20a2..c934a78d4ff 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "net/http" "strings" "time" @@ -16,7 +15,6 @@ import ( s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" s3v2types "github.com/aws/aws-sdk-go-v2/service/s3/types" - "github.com/aws/aws-sdk-go/aws/awserr" "github.com/eapache/go-resiliency/retrier" "github.com/google/uuid" "github.com/mitchellh/mapstructure" From 29b816e18421b9dff8607406143e1478d461403b Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 6 Sep 2023 13:35:52 +0300 Subject: [PATCH 13/40] Merge remote-tracking branch 'origin/main' into athena-connector --- runtime/drivers/duckdb/transporter/utils.go | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/runtime/drivers/duckdb/transporter/utils.go b/runtime/drivers/duckdb/transporter/utils.go index 32d21aeb857..ee609e148ec 100644 --- a/runtime/drivers/duckdb/transporter/utils.go +++ b/runtime/drivers/duckdb/transporter/utils.go @@ -7,24 +7,6 @@ import ( "strings" ) -// rawConn is similar to *sql.Conn.Raw, but additionally unwraps otelsql (which we use for instrumentation). 
-func rawConn(conn *sql.Conn, f func(driver.Conn) error) error { - return conn.Raw(func(raw any) error { - // For details, see: https://github.com/XSAM/otelsql/issues/98 - if c, ok := raw.(interface{ Raw() driver.Conn }); ok { - raw = c.Raw() - } - - // This is currently guaranteed, but adding check to be safe - driverConn, ok := raw.(driver.Conn) - if !ok { - return fmt.Errorf("internal: did not obtain a driver.Conn") - } - - return f(driverConn) - }) -} - func sourceReader(paths []string, format string, ingestionProps map[string]any, fromAthena bool) (string, error) { // Generate a "read" statement if containsAny(format, []string{".csv", ".tsv", ".txt"}) { From e5f1794b313f87e6c748c05d03b3b7475901c9bc Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 6 Sep 2023 13:43:00 +0300 Subject: [PATCH 14/40] Merge remote-tracking branch 'origin/main' into athena-connector --- runtime/drivers/athena/athena.go | 5 ++--- runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index c934a78d4ff..ea3a12a7789 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -14,12 +14,12 @@ import ( "github.com/aws/aws-sdk-go-v2/service/athena/types" s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" s3v2types "github.com/aws/aws-sdk-go-v2/service/s3/types" - "github.com/eapache/go-resiliency/retrier" "github.com/google/uuid" "github.com/mitchellh/mapstructure" "github.com/rilldata/rill/runtime/drivers" rillblob "github.com/rilldata/rill/runtime/drivers/blob" + "github.com/rilldata/rill/runtime/pkg/activity" "go.uber.org/zap" "gocloud.dev/blob" "gocloud.dev/blob/s3blob" @@ -79,7 +79,7 @@ type configProperties struct { SessionToken string `mapstructure:"aws_access_token"` } -func (d driver) Open(config map[string]any, shared bool, logger *zap.Logger) (drivers.Handle, error) { +func (d driver) Open(config map[string]any, shared bool, client activity.Client, logger *zap.Logger) (drivers.Handle, error) { if shared { return nil, fmt.Errorf("athena driver can't be shared") } @@ -298,7 +298,6 @@ func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourcePro status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ QueryExecutionId: athenaExecution.QueryExecutionId, }) - if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go index 0f295d1bfef..5d0de761bf6 100644 --- a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go @@ -62,7 +62,7 @@ func (s *sqlStoreToDuckDB) Transfer(ctx context.Context, source drivers.Source, } format := fileutil.FullExt(files[0]) - from, err := sourceReader(files, format, make(map[string]any)) + from, err := sourceReader(files, format, make(map[string]any), false) if err != nil { return err } From 549cf675206da35a594f116bbb3a0d9a6d15877d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Benjamin=20Egelund-M=C3=BCller?= <b@egelund-muller.com> Date: Thu, 7 Sep 2023 11:19:59 +0200 Subject: [PATCH 15/40] Run go mod tidy --- go.mod | 14 ++++++-------- go.sum | 7 ------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index 9d32dca8279..d23c70a8746 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/apache/arrow/go/v13 v13.0.0 
github.com/apache/calcite-avatica-go/v5 v5.2.0 github.com/aws/aws-sdk-go v1.44.268 + github.com/aws/aws-sdk-go-v2/service/athena v1.31.6 github.com/benbjohnson/clock v1.3.5 github.com/bmatcuk/doublestar/v4 v4.6.0 github.com/bradleyfalzon/ghinstallation/v2 v2.4.0 @@ -86,10 +87,7 @@ require ( moul.io/zapfilter v1.7.0 ) -require ( - github.com/aws/aws-sdk-go-v2/service/athena v1.31.6 // indirect - google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect -) +require google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect require ( cloud.google.com/go v0.110.2 // indirect @@ -107,10 +105,10 @@ require ( github.com/alicebob/gopher-json v0.0.0-20230218143504-906a9b012302 // indirect github.com/andybalholm/brotli v1.0.5 // indirect github.com/apache/thrift v0.18.1 // indirect - github.com/aws/aws-sdk-go-v2 v1.21.0 // indirect + github.com/aws/aws-sdk-go-v2 v1.21.0 github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.10 // indirect - github.com/aws/aws-sdk-go-v2/config v1.18.25 // indirect - github.com/aws/aws-sdk-go-v2/credentials v1.13.24 // indirect + github.com/aws/aws-sdk-go-v2/config v1.18.25 + github.com/aws/aws-sdk-go-v2/credentials v1.13.24 github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.13.3 // indirect github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.67 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41 // indirect @@ -121,7 +119,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.1.28 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.27 // indirect github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.14.2 // indirect - github.com/aws/aws-sdk-go-v2/service/s3 v1.33.1 // indirect + github.com/aws/aws-sdk-go-v2/service/s3 v1.33.1 github.com/aws/aws-sdk-go-v2/service/sso v1.12.10 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.14.10 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.19.0 // indirect diff --git a/go.sum b/go.sum index 1fd02bb6401..dd66628f700 100644 --- a/go.sum +++ b/go.sum @@ -592,7 +592,6 @@ github.com/aws/aws-sdk-go v1.44.268 h1:WoK20tlAvsvQzTcE6TajoprbXmTbcud6MjhErL4P/ github.com/aws/aws-sdk-go v1.44.268/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/aws/aws-sdk-go-v2 v1.9.1/go.mod h1:cK/D0BBs0b/oWPIcX/Z/obahJK1TT7IPVjy53i/mX/4= github.com/aws/aws-sdk-go-v2 v1.17.4/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= -github.com/aws/aws-sdk-go-v2 v1.18.0 h1:882kkTpSFhdgYRKVZ/VCgf7sd0ru57p2JCxz4/oN5RY= github.com/aws/aws-sdk-go-v2 v1.18.0/go.mod h1:uzbQtefpm44goOPmdKyAlXSNcwlRgF3ePWVW6EtJvvw= github.com/aws/aws-sdk-go-v2 v1.21.0 h1:gMT0IW+03wtYJhRqTVYn0wLzwdnK9sRMcxmtfGzRdJc= github.com/aws/aws-sdk-go-v2 v1.21.0/go.mod h1:/RfNgGmRxI+iFOB1OeJUyxiU+9s88k3pfHvDagGEp0M= @@ -611,12 +610,10 @@ github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.51/go.mod h1:7Grl2gV+dx9SW github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.67 h1:fI9/5BDEaAv/pv1VO1X1n3jfP9it+IGqWsCuuBQI8wM= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.67/go.mod h1:zQClPRIwQZfJlZq6WZve+s4Tb4JW+3V6eS+4+KrYeP8= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.28/go.mod h1:3lwChorpIM/BhImY/hy+Z6jekmN92cXGPI1QJasVPYY= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33 h1:kG5eQilShqmJbv11XL1VpyDbaEJzWxd4zRiCG30GSn4= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.33/go.mod h1:7i0PF1ME/2eUPFcjkVIwq+DOygHEoK92t5cDqNgYbIw= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41 
h1:22dGT7PneFMx4+b3pz7lMTRyN8ZKH7M2cW4GP9yUS2g= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.41/go.mod h1:CrObHAuPneJBlfEJ5T3szXOUkLEThaGfvnhTf33buas= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.22/go.mod h1:EqK7gVrIGAHyZItrD1D8B0ilgwMD1GiWAmbU4u/JHNk= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27 h1:vFQlirhuM8lLlpI7imKOMsjdQLuN9CPi+k44F/OFVsk= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.27/go.mod h1:UrHnn3QV/d0pBZ6QBAEQcqFLf8FAzLmoUfPVIueOvoM= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35 h1:SijA0mgjV8E+8G45ltVHs0fvKpTj8xmZJ3VwhGKtUSI= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.35/go.mod h1:SJC1nEVVva1g3pHAIdCp7QsRIkMmLAgoDquQ9Rr8kYw= @@ -658,7 +655,6 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.18.3/go.mod h1:b+psTJn33Q4qGoDaM7ZiO github.com/aws/aws-sdk-go-v2/service/sts v1.19.0 h1:2DQLAKDteoEDI8zpCzqBMaZlJuoE9iTYD0gFmXVax9E= github.com/aws/aws-sdk-go-v2/service/sts v1.19.0/go.mod h1:BgQOMsg8av8jset59jelyPW7NoZcZXLVpDsXunGDrk8= github.com/aws/smithy-go v1.8.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= -github.com/aws/smithy-go v1.13.5 h1:hgz0X/DX0dGqTYpGALqXJoRKRj5oQ7150i5FdTePzO8= github.com/aws/smithy-go v1.13.5/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= github.com/aws/smithy-go v1.14.2 h1:MJU9hqBGbvWZdApzpvoF2WAIJDbtjK2NDJSiJP7HblQ= github.com/aws/smithy-go v1.14.2/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= @@ -2006,7 +2002,6 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= -github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= @@ -3291,9 +3286,7 @@ modernc.org/cc/v3 v3.40.0/go.mod h1:/bTg4dnWkSXowUO6ssQKnOV0yMVxDYNIsIrzqTFDGH0= modernc.org/ccgo/v3 v3.16.13 h1:Mkgdzl46i5F/CNR/Kj80Ri59hC8TKAhZrYSaqvkwzUw= modernc.org/ccgo/v3 v3.16.13/go.mod h1:2Quk+5YgpImhPjv2Qsob1DnZ/4som1lJTodubIcoUkY= modernc.org/ccorpus v1.11.6 h1:J16RXiiqiCgua6+ZvQot4yUuUy8zxgqbqEEUuGPlISk= -modernc.org/ccorpus v1.11.6/go.mod h1:2gEUTrWqdpH2pXsmTM1ZkjeSrUWDpjMu2T6m29L/ErQ= modernc.org/httpfs v1.0.6 h1:AAgIpFZRXuYnkjftxTAZwMIiwEqAfk8aVB2/oA6nAeM= -modernc.org/httpfs v1.0.6/go.mod h1:7dosgurJGp0sPaRanU53W4xZYKh14wfzX420oZADeHM= modernc.org/libc v1.22.6 h1:cbXU8R+A6aOjRuhsFh3nbDWXO/Hs4ClJRXYB11KmPDo= modernc.org/libc v1.22.6/go.mod h1:jj+Z7dTNX8fBScMVNRAYZ/jF91K8fdT2hYMThc3YjBY= modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ= From e04060636102d42c50a23ef269241aef71b9713f Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Mon, 11 Sep 2023 19:37:29 +0300 Subject: [PATCH 16/40] athena-driver review --- runtime/drivers/athena/athena.go | 2 +- .../src/features/sources/modal/submitRemoteSourceForm.ts | 3 ++- web-common/src/features/sources/modal/yupSchemas.ts | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/runtime/drivers/athena/athena.go 
b/runtime/drivers/athena/athena.go index ea3a12a7789..0062c56dae9 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -54,7 +54,7 @@ var spec = drivers.Spec{ { Key: "region", DisplayName: "AWS region", - Description: "AWS profile for credentials.", + Description: "AWS region", Type: drivers.StringPropertyType, Required: true, }, diff --git a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts index 82a7f8f9d6b..31f22b6d946 100644 --- a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts +++ b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts @@ -63,7 +63,8 @@ export async function submitRemoteSourceForm( const formValues = Object.fromEntries( Object.entries(values).map(([key, value]) => { switch (key) { - case "project_id": + case "project_id": + case "output_location": return [key, value]; default: return [fromYupFriendlyKey(key), value]; diff --git a/web-common/src/features/sources/modal/yupSchemas.ts b/web-common/src/features/sources/modal/yupSchemas.ts index 3ef6d357edb..a67ae145466 100644 --- a/web-common/src/features/sources/modal/yupSchemas.ts +++ b/web-common/src/features/sources/modal/yupSchemas.ts @@ -80,7 +80,7 @@ export function getYupSchema(connector: V1ConnectorSpec) { ) .required("Source name is required"), output_location: yup.string().required(), - region: yup.string(), + region: yup.string().required(), }); default: From 804f03e6a3796743fddd7d6a8c90f68f996f7106 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Mon, 11 Sep 2023 19:52:48 +0300 Subject: [PATCH 17/40] athena-driver review --- runtime/drivers/athena/athena.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 0062c56dae9..b36bf6685be 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -47,7 +47,7 @@ var spec = drivers.Spec{ Key: "output_location", DisplayName: "S3 output location", Description: "Oputut location for query results in S3.", - Placeholder: "mybucket", + Placeholder: "s3://bucket-name/path/", Type: drivers.StringPropertyType, Required: true, }, @@ -236,26 +236,31 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return nil, err } - prefix := "parquet_output_" + uuid.New().String() - bucketName := strings.TrimPrefix(strings.TrimRight(conf.OutputLocation, "/"), "s3://") - unloadPath := bucketName + "/" + prefix - err = c.unload(ctx, cfg, conf, "s3://"+unloadPath) + unloadPath := "parquet_output_" + uuid.New().String() + bucketName := strings.Split(strings.TrimPrefix(conf.OutputLocation, "s3://"), "/")[0] + unloadLocation := strings.TrimRight(conf.OutputLocation, "/") + "/" + unloadPath + err = c.unload(ctx, cfg, conf, unloadLocation) if err != nil { - return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, prefix)) + return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, unloadPath)) } bucketObj, err := c.openBucket(ctx, conf, bucketName) if err != nil { - return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", unloadPath, err), cleanPath(ctx, cfg, bucketName, prefix)) + return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", unloadLocation, err), cleanPath(ctx, cfg, bucketName, unloadPath)) } opts := rillblob.Options{ - GlobPattern: prefix + "/**", + 
GlobPattern: unloadPath + "/**", } it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) if err != nil { - return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, prefix)) + return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, unloadPath)) + } + + err = cleanPath(ctx, cfg, bucketName, unloadPath) + if err != nil { + return nil, fmt.Errorf("failed to clean path: %w", err) } return it, nil @@ -279,7 +284,7 @@ func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourcePro finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, path) client := athena.NewFromConfig(cfg) resultConfig := &types.ResultConfiguration{ - OutputLocation: aws.String("s3://" + strings.TrimPrefix(strings.TrimRight(conf.OutputLocation, "/"), "s3://") + "/output/"), + OutputLocation: aws.String(strings.TrimRight(conf.OutputLocation, "/") + "/output/"), } executeParams := &athena.StartQueryExecutionInput{ From 844877f016bf8d35bbeb7e62af3b689e42820489 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 12 Sep 2023 18:18:41 +0300 Subject: [PATCH 18/40] athena-driver review --- runtime/drivers/athena/athena.go | 57 +++++++++++++------ .../catalog/artifacts/yaml/objects.go | 9 ++- .../sources/modal/submitRemoteSourceForm.ts | 3 +- .../src/features/sources/modal/yupSchemas.ts | 4 +- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index b36bf6685be..30ac103255c 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -44,7 +44,7 @@ var spec = drivers.Spec{ Placeholder: "select * from catalog.table;", }, { - Key: "output_location", + Key: "athena_output_location", DisplayName: "S3 output location", Description: "Oputut location for query results in S3.", Placeholder: "s3://bucket-name/path/", @@ -52,11 +52,11 @@ var spec = drivers.Spec{ Required: true, }, { - Key: "region", - DisplayName: "AWS region", - Description: "AWS region", + Key: "athena_workgroup", + DisplayName: "AWS Athena workgroup", + Description: "AWS Athena workgroup to use for queries.", Type: drivers.StringPropertyType, - Required: true, + Required: false, }, }, ConfigProperties: []drivers.PropertySchema{ @@ -77,6 +77,7 @@ type configProperties struct { AccessKeyID string `mapstructure:"aws_access_key_id"` SecretAccessKey string `mapstructure:"aws_secret_access_key"` SessionToken string `mapstructure:"aws_access_token"` + AllowHostAccess bool `mapstructure:"allow_host_access"` } func (d driver) Open(config map[string]any, shared bool, client activity.Client, logger *zap.Logger) (drivers.Handle, error) { @@ -105,13 +106,13 @@ func (d driver) Spec() drivers.Spec { } func (d driver) HasAnonymousSourceAccess(ctx context.Context, src drivers.Source, logger *zap.Logger) (bool, error) { - return false, fmt.Errorf("not implemented") + return false, nil } type sourceProperties struct { SQL string `mapstructure:"sql"` - OutputLocation string `mapstructure:"output_location"` - Region string `mapstructure:"region"` + OutputLocation string `mapstructure:"athena_output_location"` + WorkGroup string `mapstructure:"athena_workgroup"` } func parseSourceProperties(props map[string]any) (*sourceProperties, error) { @@ -227,11 +228,7 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return nil, 
fmt.Errorf("failed to parse config: %w", err) } - cfg, err := awsconfig.LoadDefaultConfig( - ctx, - awsconfig.WithRegion(conf.Region), - awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)), - ) + cfg, err := c.Cfg(ctx) if err != nil { return nil, err } @@ -266,12 +263,28 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return it, nil } +func (c *Connection) Cfg(ctx context.Context) (aws.Config, error) { + var cfg aws.Config + var err error + if c.config.AllowHostAccess { + cfg, err = awsconfig.LoadDefaultConfig( + ctx, + ) + } else { + cfg, err = awsconfig.LoadDefaultConfig( + ctx, + awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)), + ) + } + if err != nil { + return aws.Config{}, err + } + + return cfg, nil +} + func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { - cfg, err := awsconfig.LoadDefaultConfig( - ctx, - awsconfig.WithRegion(conf.Region), - awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)), - ) + cfg, err := c.Cfg(ctx) if err != nil { return nil, err } @@ -283,6 +296,7 @@ func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, buc func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourceProperties, path string) error { finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, path) client := athena.NewFromConfig(cfg) + resultConfig := &types.ResultConfiguration{ OutputLocation: aws.String(strings.TrimRight(conf.OutputLocation, "/") + "/output/"), } @@ -292,6 +306,13 @@ func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourcePro ResultConfiguration: resultConfig, } + if conf.WorkGroup != "" { + executeParams = &athena.StartQueryExecutionInput{ + QueryString: aws.String(finalSQL), + WorkGroup: aws.String(conf.WorkGroup), + } + } + athenaExecution, err := client.StartQueryExecution(ctx, executeParams) if err != nil { return err diff --git a/runtime/services/catalog/artifacts/yaml/objects.go b/runtime/services/catalog/artifacts/yaml/objects.go index 271d9541d19..d74fbfc79c2 100644 --- a/runtime/services/catalog/artifacts/yaml/objects.go +++ b/runtime/services/catalog/artifacts/yaml/objects.go @@ -45,7 +45,8 @@ type Source struct { SQL string `yaml:"sql,omitempty" mapstructure:"sql,omitempty"` DB string `yaml:"db,omitempty" mapstructure:"db,omitempty"` ProjectID string `yaml:"project_id,omitempty" mapstructure:"project_id,omitempty"` - AthenaOutputLocation string `yaml:"output_location,omitempty" mapstructure:"output_location,omitempty"` + AthenaOutputLocation string `yaml:"athena_output_location,omitempty" mapstructure:"athena_output_location,omitempty"` + AthenaWorkgroup string `yaml:"athena_workgroup,omitempty" mapstructure:"athena_workgroup,omitempty"` } type ExtractPolicy struct { @@ -241,7 +242,11 @@ func fromSourceArtifact(source *Source, path string) (*drivers.CatalogEntry, err } if source.AthenaOutputLocation != "" { - props["output_location"] = source.AthenaOutputLocation + props["athena_output_location"] = source.AthenaOutputLocation + } + + if source.AthenaWorkgroup != "" { + props["athena_workgroup"] = source.AthenaWorkgroup } propsPB, err := structpb.NewStruct(props) diff --git 
a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts index 31f22b6d946..1eb5872b2eb 100644 --- a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts +++ b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts @@ -64,7 +64,8 @@ export async function submitRemoteSourceForm( Object.entries(values).map(([key, value]) => { switch (key) { case "project_id": - case "output_location": + case "athena_output_location": + case "athena_workgroup": return [key, value]; default: return [fromYupFriendlyKey(key), value]; diff --git a/web-common/src/features/sources/modal/yupSchemas.ts b/web-common/src/features/sources/modal/yupSchemas.ts index a67ae145466..9474266edba 100644 --- a/web-common/src/features/sources/modal/yupSchemas.ts +++ b/web-common/src/features/sources/modal/yupSchemas.ts @@ -79,8 +79,8 @@ export function getYupSchema(connector: V1ConnectorSpec) { "Source name must start with a letter or underscore and contain only letters, numbers, and underscores" ) .required("Source name is required"), - output_location: yup.string().required(), - region: yup.string().required(), + athena_output_location: yup.string().required(), + athena_workgroup: yup.string(), }); default: From f0bbee98aa5c85d5f8f71b89cd5f4050ad5d87c1 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Tue, 12 Sep 2023 20:44:29 +0300 Subject: [PATCH 19/40] athena-driver review --- runtime/drivers/athena/athena.go | 41 ++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 30ac103255c..59059dd8a8c 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -222,13 +222,30 @@ func cleanPath(ctx context.Context, cfg aws.Config, bucketName, prefix string) e return err } +type janitorIterator struct { + drivers.FileIterator + ctx context.Context + cfg aws.Config + bucketName string + unloadPath string +} + +func (ci janitorIterator) Close() error { + err := ci.FileIterator.Close() + if err != nil { + return err + } + + return cleanPath(ci.ctx, ci.cfg, ci.bucketName, ci.unloadPath) +} + func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSource) (drivers.FileIterator, error) { conf, err := parseSourceProperties(source.Properties) if err != nil { return nil, fmt.Errorf("failed to parse config: %w", err) } - cfg, err := c.Cfg(ctx) + cfg, err := c.newCfg(ctx) if err != nil { return nil, err } @@ -255,15 +272,16 @@ func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSo return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, unloadPath)) } - err = cleanPath(ctx, cfg, bucketName, unloadPath) - if err != nil { - return nil, fmt.Errorf("failed to clean path: %w", err) - } - - return it, nil + return janitorIterator{ + FileIterator: it, + ctx: ctx, + unloadPath: unloadPath, + bucketName: bucketName, + cfg: cfg, + }, nil } -func (c *Connection) Cfg(ctx context.Context) (aws.Config, error) { +func (c *Connection) newCfg(ctx context.Context) (aws.Config, error) { var cfg aws.Config var err error if c.config.AllowHostAccess { @@ -284,7 +302,7 @@ func (c *Connection) Cfg(ctx context.Context) (aws.Config, error) { } func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { - cfg, err := c.Cfg(ctx) + cfg, 
err := c.newCfg(ctx) if err != nil { return nil, err } @@ -318,8 +336,7 @@ func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourcePro return err } - r := retrier.New(retrier.ConstantBackoff(20, 1*time.Second), nil) - + r := retrier.New(retrier.ConstantBackoff(int(5*time.Minute/time.Second), time.Second), nil) // 5 minutes timeout return r.RunCtx(ctx, func(ctx context.Context) error { status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ QueryExecutionId: athenaExecution.QueryExecutionId, @@ -335,6 +352,6 @@ func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourcePro } else if state == types.QueryExecutionStateFailed { return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) } - return fmt.Errorf("Execution is not completed yet, current state: %s", state) + return fmt.Errorf("Athena ingestion timeout") }) } From 03d5c42b9fecd6a9f9c5bfc74e4dc387fd8abe4d Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 13 Sep 2023 17:29:23 +0300 Subject: [PATCH 20/40] Merge remote-tracking branch 'origin/main' into athena-connector --- .../transporter/objectStore_to_duckDB.go | 4 +-- runtime/drivers/duckdb/transporter/utils.go | 30 +++++++++++++++++++ .../sources/modal/AddSourceModal.svelte | 1 + 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go index cf28daac58e..e6b04a03b95 100644 --- a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go @@ -51,10 +51,10 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps } fromAthena := reflect.TypeOf(t.from).AssignableTo(reflect.TypeOf(&athena.Connection{})) - sql, hasSQL := src.Properties["sql"].(string) + sql, hasSQL := srcProps["sql"].(string) // if sql is specified use ast rewrite to fill in the downloaded files if hasSQL && !fromAthena { - return t.ingestDuckDBSQL(ctx, sql, iterator, dbSink, opts, p) + return t.ingestDuckDBSQL(ctx, sql, iterator, sinkCfg, opts, p) } p.Target(size, drivers.ProgressUnitByte) diff --git a/runtime/drivers/duckdb/transporter/utils.go b/runtime/drivers/duckdb/transporter/utils.go index ee609e148ec..f23b26fbe3d 100644 --- a/runtime/drivers/duckdb/transporter/utils.go +++ b/runtime/drivers/duckdb/transporter/utils.go @@ -5,8 +5,38 @@ import ( "os" "path/filepath" "strings" + + "github.com/mitchellh/mapstructure" ) +type sourceProperties struct { + Database string `mapstructure:"db"` + SQL string `mapstructure:"sql"` +} + +func parseSourceProperties(props map[string]any) (*sourceProperties, error) { + cfg := &sourceProperties{} + if err := mapstructure.Decode(props, cfg); err != nil { + return nil, fmt.Errorf("failed to parse source properties: %w", err) + } + if cfg.SQL == "" { + return nil, fmt.Errorf("property 'sql' is mandatory") + } + return cfg, nil +} + +type sinkProperties struct { + Table string `mapstructure:"table"` +} + +func parseSinkProperties(props map[string]any) (*sinkProperties, error) { + cfg := &sinkProperties{} + if err := mapstructure.Decode(props, cfg); err != nil { + return nil, fmt.Errorf("failed to parse sink properties: %w", err) + } + return cfg, nil +} + func sourceReader(paths []string, format string, ingestionProps map[string]any, fromAthena bool) (string, error) { // Generate a "read" statement if containsAny(format, 
[]string{".csv", ".tsv", ".txt"}) { diff --git a/web-common/src/features/sources/modal/AddSourceModal.svelte b/web-common/src/features/sources/modal/AddSourceModal.svelte index 2d566b9118d..9be5042f5eb 100644 --- a/web-common/src/features/sources/modal/AddSourceModal.svelte +++ b/web-common/src/features/sources/modal/AddSourceModal.svelte @@ -28,6 +28,7 @@ "local_file", "motherduck", "bigquery", + "athena", ]; const connectors = createRuntimeServiceListConnectors({ From a5a5146ab72daa0bddb243b127cbf252d5f06826 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 13 Sep 2023 17:37:13 +0300 Subject: [PATCH 21/40] Merge remote-tracking branch 'origin/main' into athena-connector --- runtime/drivers/athena/athena.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 59059dd8a8c..184ab6335a2 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -105,7 +105,7 @@ func (d driver) Spec() drivers.Spec { return spec } -func (d driver) HasAnonymousSourceAccess(ctx context.Context, src drivers.Source, logger *zap.Logger) (bool, error) { +func (d driver) HasAnonymousSourceAccess(ctx context.Context, src map[string]any, logger *zap.Logger) (bool, error) { return false, nil } @@ -239,8 +239,8 @@ func (ci janitorIterator) Close() error { return cleanPath(ci.ctx, ci.cfg, ci.bucketName, ci.unloadPath) } -func (c *Connection) DownloadFiles(ctx context.Context, source *drivers.BucketSource) (drivers.FileIterator, error) { - conf, err := parseSourceProperties(source.Properties) +func (c *Connection) DownloadFiles(ctx context.Context, props map[string]any) (drivers.FileIterator, error) { + conf, err := parseSourceProperties(props) if err != nil { return nil, fmt.Errorf("failed to parse config: %w", err) } From b7fc59d1d1c5b2986eb884ed586418ffbbef2fd2 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 13 Sep 2023 21:34:15 +0300 Subject: [PATCH 22/40] athena-driver review --- runtime/drivers/athena/athena.go | 70 +++++-------------- .../transporter/objectStore_to_duckDB.go | 16 ++--- .../duckdb/transporter/sqlstore_to_duckDB.go | 5 +- 3 files changed, 28 insertions(+), 63 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 184ab6335a2..38bc998b817 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -2,7 +2,6 @@ package athena import ( "context" - "errors" "fmt" "strings" "time" @@ -15,10 +14,8 @@ import ( s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" s3v2types "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/eapache/go-resiliency/retrier" - "github.com/google/uuid" "github.com/mitchellh/mapstructure" "github.com/rilldata/rill/runtime/drivers" - rillblob "github.com/rilldata/rill/runtime/drivers/blob" "github.com/rilldata/rill/runtime/pkg/activity" "go.uber.org/zap" "gocloud.dev/blob" @@ -181,7 +178,7 @@ func (c *Connection) MigrationStatus(ctx context.Context) (current, desired int, // AsObjectStore implements drivers.Connection. func (c *Connection) AsObjectStore() (drivers.ObjectStore, bool) { - return c, true + return nil, false } // AsTransporter implements drivers.Connection. @@ -195,7 +192,7 @@ func (c *Connection) AsFileStore() (drivers.FileStore, bool) { // AsSQLStore implements drivers.Connection. 
func (c *Connection) AsSQLStore() (drivers.SQLStore, bool) { - return nil, false + return c, true } func cleanPath(ctx context.Context, cfg aws.Config, bucketName, prefix string) error { @@ -239,48 +236,6 @@ func (ci janitorIterator) Close() error { return cleanPath(ci.ctx, ci.cfg, ci.bucketName, ci.unloadPath) } -func (c *Connection) DownloadFiles(ctx context.Context, props map[string]any) (drivers.FileIterator, error) { - conf, err := parseSourceProperties(props) - if err != nil { - return nil, fmt.Errorf("failed to parse config: %w", err) - } - - cfg, err := c.newCfg(ctx) - if err != nil { - return nil, err - } - - unloadPath := "parquet_output_" + uuid.New().String() - bucketName := strings.Split(strings.TrimPrefix(conf.OutputLocation, "s3://"), "/")[0] - unloadLocation := strings.TrimRight(conf.OutputLocation, "/") + "/" + unloadPath - err = c.unload(ctx, cfg, conf, unloadLocation) - if err != nil { - return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, unloadPath)) - } - - bucketObj, err := c.openBucket(ctx, conf, bucketName) - if err != nil { - return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", unloadLocation, err), cleanPath(ctx, cfg, bucketName, unloadPath)) - } - - opts := rillblob.Options{ - GlobPattern: unloadPath + "/**", - } - - it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) - if err != nil { - return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, unloadPath)) - } - - return janitorIterator{ - FileIterator: it, - ctx: ctx, - unloadPath: unloadPath, - bucketName: bucketName, - cfg: cfg, - }, nil -} - func (c *Connection) newCfg(ctx context.Context) (aws.Config, error) { var cfg aws.Config var err error @@ -311,12 +266,25 @@ func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, buc return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) } -func (c *Connection) unload(ctx context.Context, cfg aws.Config, conf *sourceProperties, path string) error { - finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, path) - client := athena.NewFromConfig(cfg) +func resolveOutputLocation(ctx context.Context, client *athena.Client, conf *sourceProperties) (string, error) { + if conf.OutputLocation != "" { + return conf.OutputLocation, nil + } else { + wo, err := client.GetWorkGroup(ctx, &athena.GetWorkGroupInput{ + WorkGroup: aws.String(conf.WorkGroup), + }) + if err != nil { + return "", err + } + return *wo.WorkGroup.Configuration.ResultConfiguration.OutputLocation, nil + } +} + +func (c *Connection) unload(client *athena.Client, ctx context.Context, cfg aws.Config, conf *sourceProperties, unloadLocation string) error { + finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, unloadLocation) resultConfig := &types.ResultConfiguration{ - OutputLocation: aws.String(strings.TrimRight(conf.OutputLocation, "/") + "/output/"), + OutputLocation: aws.String(conf.OutputLocation), } executeParams := &athena.StartQueryExecutionInput{ diff --git a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go index e6b04a03b95..28a68c13d95 100644 --- a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go @@ -4,13 +4,11 @@ import ( "context" "errors" "fmt" - "reflect" "strings" "time" "github.com/c2h5oh/datasize" 
"github.com/rilldata/rill/runtime/drivers" - "github.com/rilldata/rill/runtime/drivers/athena" "github.com/rilldata/rill/runtime/pkg/duckdbsql" "github.com/rilldata/rill/runtime/pkg/fileutil" "github.com/rilldata/rill/runtime/pkg/observability" @@ -50,10 +48,9 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps return drivers.ErrIngestionLimitExceeded } - fromAthena := reflect.TypeOf(t.from).AssignableTo(reflect.TypeOf(&athena.Connection{})) sql, hasSQL := srcProps["sql"].(string) // if sql is specified use ast rewrite to fill in the downloaded files - if hasSQL && !fromAthena { + if hasSQL { return t.ingestDuckDBSQL(ctx, sql, iterator, sinkCfg, opts, p) } @@ -63,9 +60,6 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps val, formatDefined := srcProps["format"].(string) if formatDefined { format = fmt.Sprintf(".%s", val) - } else if fromAthena { - format = "parquet" - formatDefined = true } allowSchemaRelaxation, err := schemaRelaxationProperty(srcProps) @@ -108,13 +102,13 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps st := time.Now() t.logger.Info("ingesting files", zap.Strings("files", files), observability.ZapCtx(ctx)) if appendToTable { - if err := a.appendData(ctx, files, format, fromAthena); err != nil { + if err := a.appendData(ctx, files, format); err != nil { return err } } else { var from string var err error - from, err = sourceReader(files, format, ingestionProps, fromAthena) + from, err = sourceReader(files, format, ingestionProps, false) if err != nil { return err } @@ -155,8 +149,8 @@ func newAppender(to drivers.OLAPStore, sink *sinkProperties, ingestionProps map[ } } -func (a *appender) appendData(ctx context.Context, files []string, format string, fromAthena bool) error { - from, err := sourceReader(files, format, a.ingestionProps, fromAthena) +func (a *appender) appendData(ctx context.Context, files []string, format string) error { + from, err := sourceReader(files, format, a.ingestionProps, false) if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go index 4b5fc99356d..111be0bc720 100644 --- a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go @@ -3,9 +3,11 @@ package transporter import ( "context" "fmt" + "reflect" "time" "github.com/rilldata/rill/runtime/drivers" + "github.com/rilldata/rill/runtime/drivers/athena" "github.com/rilldata/rill/runtime/pkg/fileutil" "github.com/rilldata/rill/runtime/pkg/observability" "go.uber.org/zap" @@ -51,6 +53,7 @@ func (s *sqlStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps map // TODO :: iteration over fileiterator is similar(apart from no schema changes possible here) // to consuming fileIterator in objectStore_to_duckDB // both can be refactored to follow same path + fromAthena := reflect.TypeOf(s.from).AssignableTo(reflect.TypeOf(&athena.Connection{})) for iter.HasNext() { files, err := iter.NextBatch(opts.IteratorBatch) if err != nil { @@ -58,7 +61,7 @@ func (s *sqlStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps map } format := fileutil.FullExt(files[0]) - from, err := sourceReader(files, format, make(map[string]any), false) + from, err := sourceReader(files, format, make(map[string]any), fromAthena) if err != nil { return err } From 28606aa0952cad80ec5c2d18665ba49aa3676557 Mon Sep 17 00:00:00 2001 From: Egor Ryashin 
<egor.ryashin@rilldata.com> Date: Wed, 13 Sep 2023 21:35:50 +0300 Subject: [PATCH 23/40] athena-driver review --- runtime/drivers/athena/athena.go | 1 - runtime/drivers/athena/sql_store.go | 69 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 runtime/drivers/athena/sql_store.go diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 38bc998b817..d1fa0a5b726 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -3,7 +3,6 @@ package athena import ( "context" "fmt" - "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go new file mode 100644 index 00000000000..da6867aeac0 --- /dev/null +++ b/runtime/drivers/athena/sql_store.go @@ -0,0 +1,69 @@ +package athena + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/aws/aws-sdk-go-v2/service/athena" + "github.com/google/uuid" + "github.com/rilldata/rill/runtime/drivers" + rillblob "github.com/rilldata/rill/runtime/drivers/blob" +) + +func (c *Connection) Query(ctx context.Context, props map[string]any) (drivers.RowIterator, error) { + return nil, fmt.Errorf("not implemented") +} + +func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, opt *drivers.QueryOption, p drivers.Progress) (drivers.FileIterator, error) { + conf, err := parseSourceProperties(props) + if err != nil { + return nil, fmt.Errorf("failed to parse config: %w", err) + } + + cfg, err := c.newCfg(ctx) + if err != nil { + return nil, err + } + + client := athena.NewFromConfig(cfg) + outputLocation, err := resolveOutputLocation(ctx, client, conf) + if err != nil { + return nil, err + } + + // outputLocation s3://bucket-name/prefix + // unloadLocation s3://bucket-name/prefix/rill-connector-parquet-output-<uuid> + // unloadPath prefix/rill-connector-parquet-output-<uuid> + unloadFolderName := "parquet_output_" + uuid.New().String() + bucketName := strings.Split(strings.TrimPrefix(outputLocation, "s3://"), "/")[0] + unloadLocation := strings.TrimRight(outputLocation, "/") + "/" + unloadFolderName + unloadPath := strings.TrimPrefix(strings.TrimPrefix(unloadLocation, "s3://"+bucketName), "/") + err = c.unload(client, ctx, cfg, conf, strings.TrimRight(outputLocation, "/")+unloadFolderName) + if err != nil { + return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, unloadPath)) + } + + bucketObj, err := c.openBucket(ctx, conf, bucketName) + if err != nil { + return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", bucketName, err), cleanPath(ctx, cfg, bucketName, unloadPath)) + } + + opts := rillblob.Options{ + GlobPattern: unloadPath + "/**", + } + + it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) + if err != nil { + return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, unloadPath)) + } + + return janitorIterator{ + FileIterator: it, + ctx: ctx, + unloadPath: unloadPath, + bucketName: bucketName, + cfg: cfg, + }, nil +} From c6af926dbceb6f90faf5e9287f4b5be8af96d5a2 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Wed, 13 Sep 2023 21:58:24 +0300 Subject: [PATCH 24/40] athena-driver review --- runtime/drivers/athena/athena.go | 12 ++++++++++-- runtime/drivers/athena/sql_store.go | 3 ++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/runtime/drivers/athena/athena.go 
b/runtime/drivers/athena/athena.go index d1fa0a5b726..88fa431f7ab 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -2,6 +2,7 @@ package athena import ( "context" + "errors" "fmt" "time" @@ -204,6 +205,10 @@ func cleanPath(ctx context.Context, cfg aws.Config, bucketName, prefix string) e return err } + if len(out.Contents) > 1000 { // aws error is opaque here + return fmt.Errorf("too many objects to delete %d from %s", len(out.Contents), "s3://"+bucketName+"/"+prefix) + } + ids := make([]s3v2types.ObjectIdentifier, 0, len(out.Contents)) for _, o := range out.Contents { ids = append(ids, s3v2types.ObjectIdentifier{ @@ -211,6 +216,7 @@ func cleanPath(ctx context.Context, cfg aws.Config, bucketName, prefix string) e }) } _, err = s3client.DeleteObjects(ctx, &s3v2.DeleteObjectsInput{ + Bucket: &bucketName, Delete: &s3v2types.Delete{ Objects: ids, }, @@ -268,7 +274,7 @@ func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, buc func resolveOutputLocation(ctx context.Context, client *athena.Client, conf *sourceProperties) (string, error) { if conf.OutputLocation != "" { return conf.OutputLocation, nil - } else { + } else if conf.WorkGroup != "" { wo, err := client.GetWorkGroup(ctx, &athena.GetWorkGroupInput{ WorkGroup: aws.String(conf.WorkGroup), }) @@ -277,9 +283,11 @@ func resolveOutputLocation(ctx context.Context, client *athena.Client, conf *sou } return *wo.WorkGroup.Configuration.ResultConfiguration.OutputLocation, nil } + + return "", errors.New("Athena output location or Athena workgroup is required") } -func (c *Connection) unload(client *athena.Client, ctx context.Context, cfg aws.Config, conf *sourceProperties, unloadLocation string) error { +func (c *Connection) unload(ctx context.Context, client *athena.Client, cfg aws.Config, conf *sourceProperties, unloadLocation string) error { finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, unloadLocation) resultConfig := &types.ResultConfiguration{ diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index da6867aeac0..237dda9dc6e 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -33,6 +33,7 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, opt return nil, err } + // ie // outputLocation s3://bucket-name/prefix // unloadLocation s3://bucket-name/prefix/rill-connector-parquet-output-<uuid> // unloadPath prefix/rill-connector-parquet-output-<uuid> @@ -40,7 +41,7 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, opt bucketName := strings.Split(strings.TrimPrefix(outputLocation, "s3://"), "/")[0] unloadLocation := strings.TrimRight(outputLocation, "/") + "/" + unloadFolderName unloadPath := strings.TrimPrefix(strings.TrimPrefix(unloadLocation, "s3://"+bucketName), "/") - err = c.unload(client, ctx, cfg, conf, strings.TrimRight(outputLocation, "/")+unloadFolderName) + err = c.unload(ctx, client, cfg, conf, unloadLocation) if err != nil { return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, unloadPath)) } From 6d0476511e463b9d62d3324ceaff60808d58cb01 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Thu, 14 Sep 2023 14:14:04 +0300 Subject: [PATCH 25/40] Merge remote-tracking branch 'origin/main' into athena-connector --- .../transporter/objectStore_to_duckDB.go | 1 - runtime/drivers/duckdb/transporter/utils.go | 76 ------------------- 2 files changed, 
77 deletions(-) diff --git a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go index 3530360d292..031e1b60e95 100644 --- a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go @@ -52,7 +52,6 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps return drivers.ErrIngestionLimitExceeded } - sql, hasSQL := srcProps["sql"].(string) // if sql is specified use ast rewrite to fill in the downloaded files if srcCfg.SQL != "" { return t.ingestDuckDBSQL(ctx, srcCfg.SQL, iterator, sinkCfg, opts, p) diff --git a/runtime/drivers/duckdb/transporter/utils.go b/runtime/drivers/duckdb/transporter/utils.go index e148ea94528..c7b56fdd1dc 100644 --- a/runtime/drivers/duckdb/transporter/utils.go +++ b/runtime/drivers/duckdb/transporter/utils.go @@ -98,82 +98,6 @@ func parseFileSourceProperties(props map[string]any) (*fileSourceProperties, err return cfg, nil } -type dbSourceProperties struct { - Database string `mapstructure:"db"` - SQL string `mapstructure:"sql"` -} - -func parseDBSourceProperties(props map[string]any) (*dbSourceProperties, error) { - cfg := &dbSourceProperties{} - if err := mapstructure.Decode(props, cfg); err != nil { - return nil, fmt.Errorf("failed to parse source properties: %w", err) - } - if cfg.SQL == "" { - return nil, fmt.Errorf("property 'sql' is mandatory") - } - return cfg, nil -} - -type fileSourceProperties struct { - SQL string `mapstructure:"sql"` - DuckDB map[string]any `mapstructure:"duckdb"` - Format string `mapstructure:"format"` - AllowSchemaRelaxation bool `mapstructure:"allow_schema_relaxation"` - BatchSize string `mapstructure:"batch_size"` - BatchSizeBytes int64 `mapstructure:"-"` // Inferred from BatchSize - - // Backwards compatibility - HivePartitioning *bool `mapstructure:"hive_partitioning"` - CSVDelimiter string `mapstructure:"csv.delimiter"` - IngestAllowSchemaRelaxation *bool `mapstructure:"ingest.allow_schema_relaxation"` -} - -func parseFileSourceProperties(props map[string]any) (*fileSourceProperties, error) { - cfg := &fileSourceProperties{} - if err := mapstructure.Decode(props, cfg); err != nil { - return nil, fmt.Errorf("failed to parse source properties: %w", err) - } - - if cfg.DuckDB == nil { - cfg.DuckDB = map[string]any{} - } - - if cfg.HivePartitioning != nil { - cfg.DuckDB["hive_partitioning"] = *cfg.HivePartitioning - cfg.HivePartitioning = nil - } - - if cfg.CSVDelimiter != "" { - cfg.DuckDB["delim"] = fmt.Sprintf("'%v'", cfg.CSVDelimiter) - cfg.CSVDelimiter = "" - } - - if cfg.IngestAllowSchemaRelaxation != nil { - cfg.AllowSchemaRelaxation = *cfg.IngestAllowSchemaRelaxation - cfg.IngestAllowSchemaRelaxation = nil - } - - if cfg.AllowSchemaRelaxation { - if val, ok := cfg.DuckDB["union_by_name"].(bool); ok && !val { - return nil, fmt.Errorf("can't set `union_by_name` and `allow_schema_relaxation` at the same time") - } - - if hasKey(cfg.DuckDB, "columns", "types", "dtypes") { - return nil, fmt.Errorf("if any of `columns`,`types`,`dtypes` is set `allow_schema_relaxation` must be disabled") - } - } - - if cfg.BatchSize != "" { - b, err := datasize.ParseString(cfg.BatchSize) - if err != nil { - return nil, err - } - cfg.BatchSizeBytes = int64(b.Bytes()) - } - - return cfg, nil -} - func sourceReader(paths []string, format string, ingestionProps map[string]any, fromAthena bool) (string, error) { // Generate a "read" statement if containsAny(format, []string{".csv", ".tsv", 
".txt"}) { From 45efd9a02d88e6dd6b44d86f61cb8166febaf905 Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Thu, 14 Sep 2023 14:32:02 +0300 Subject: [PATCH 26/40] Merge remote-tracking branch 'origin/main' into athena-connector --- web-common/src/features/sources/modal/submitRemoteSourceForm.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts index 1eb5872b2eb..148e8f66596 100644 --- a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts +++ b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts @@ -63,7 +63,7 @@ export async function submitRemoteSourceForm( const formValues = Object.fromEntries( Object.entries(values).map(([key, value]) => { switch (key) { - case "project_id": + case "project_id": case "athena_output_location": case "athena_workgroup": return [key, value]; From b594ed4e8a2f8d16cc917882e97a1f19609d642d Mon Sep 17 00:00:00 2001 From: Egor Ryashin <egor.ryashin@rilldata.com> Date: Thu, 21 Sep 2023 13:01:44 +0300 Subject: [PATCH 27/40] athena-driver review --- runtime/drivers/athena/athena.go | 41 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 88fa431f7ab..c408ccca690 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/athena/types" s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" s3v2types "github.com/aws/aws-sdk-go-v2/service/s3/types" - "github.com/eapache/go-resiliency/retrier" "github.com/mitchellh/mapstructure" "github.com/rilldata/rill/runtime/drivers" "github.com/rilldata/rill/runtime/pkg/activity" @@ -311,22 +310,28 @@ func (c *Connection) unload(ctx context.Context, client *athena.Client, cfg aws. 
return err } - r := retrier.New(retrier.ConstantBackoff(int(5*time.Minute/time.Second), time.Second), nil) // 5 minutes timeout - return r.RunCtx(ctx, func(ctx context.Context) error { - status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ - QueryExecutionId: athenaExecution.QueryExecutionId, - }) - if err != nil { - return err + tm := time.NewTimer(5 * time.Minute) + defer tm.Stop() + for { + select { + case <-tm.C: + fmt.Errorf("Athena ingestion timeout") + default: + status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ + QueryExecutionId: athenaExecution.QueryExecutionId, + }) + if err != nil { + return err + } + + state := status.QueryExecution.Status.State + + if state == types.QueryExecutionStateSucceeded || state == types.QueryExecutionStateCancelled { + return nil + } else if state == types.QueryExecutionStateFailed { + return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) + } } - - state := status.QueryExecution.Status.State - - if state == types.QueryExecutionStateSucceeded || state == types.QueryExecutionStateCancelled { - return nil - } else if state == types.QueryExecutionStateFailed { - return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) - } - return fmt.Errorf("Athena ingestion timeout") - }) + time.Sleep(time.Second) + } } From bf2f0449884e1cb834445b18bcd8c13ba4bdaa27 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Sat, 23 Sep 2023 11:10:37 +0300 Subject: [PATCH 28/40] Auto-determine AWS region cleanUp function Added AWS region and reordered functions Moved functions to sql_store Renaming and code refactoring --- runtime/connections.go | 2 +- runtime/drivers/athena/athena.go | 196 ++---------- runtime/drivers/athena/sql_store.go | 285 ++++++++++++++++-- .../duckdb/transporter/filestore_to_duckDB.go | 2 +- .../transporter/objectStore_to_duckDB.go | 4 +- .../duckdb/transporter/sqlstore_to_duckDB.go | 8 +- runtime/drivers/duckdb/transporter/utils.go | 4 +- .../catalog/artifacts/yaml/objects.go | 8 +- .../catalog/migrator/sources/sources.go | 6 +- .../sources/modal/submitRemoteSourceForm.ts | 4 +- .../src/features/sources/modal/yupSchemas.ts | 4 +- 11 files changed, 305 insertions(+), 218 deletions(-) diff --git a/runtime/connections.go b/runtime/connections.go index 4ca6ff5344a..9710a9fc48b 100644 --- a/runtime/connections.go +++ b/runtime/connections.go @@ -227,7 +227,7 @@ func (r *Runtime) connectorConfig(ctx context.Context, instanceID, name string) // For backwards compatibility, certain root-level variables apply to certain implicit connectors. // NOTE: This switches on connector.Name, not connector.Type, because this only applies to implicit connectors. 
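// ----------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch series): the patch
// above replaces the retrier with a timer-driven loop that polls
// GetQueryExecution until the UNLOAD query reaches a terminal state. Below is a
// self-contained sketch of that polling pattern using a context deadline; the
// function name, the pollInterval constant, and returning an error when the
// deadline expires are assumptions about the intended behaviour.
// ----------------------------------------------------------------------------
package sketch

import (
	"context"
	"fmt"
	"time"

	"github.com/aws/aws-sdk-go-v2/service/athena"
	"github.com/aws/aws-sdk-go-v2/service/athena/types"
)

func waitForQuery(ctx context.Context, client *athena.Client, executionID *string) error {
	// Give the whole wait a 5-minute budget, mirroring the timer in the patch.
	ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
	defer cancel()

	const pollInterval = time.Second
	for {
		out, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{
			QueryExecutionId: executionID,
		})
		if err != nil {
			return err
		}

		switch out.QueryExecution.Status.State {
		case types.QueryExecutionStateSucceeded, types.QueryExecutionStateCancelled:
			return nil
		case types.QueryExecutionStateFailed:
			return fmt.Errorf("athena query execution failed: %s",
				*out.QueryExecution.Status.AthenaError.ErrorMessage)
		}

		// Not terminal yet: wait for the next poll tick or for the deadline.
		select {
		case <-ctx.Done():
			return fmt.Errorf("athena ingestion timeout: %w", ctx.Err())
		case <-time.After(pollInterval):
		}
	}
}
// ----------------------------------------------------------------------------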
switch connector.Name { - case "s3": + case "s3", "athena": setIfNil(cfg, "aws_access_key_id", vars["aws_access_key_id"]) setIfNil(cfg, "aws_secret_access_key", vars["aws_secret_access_key"]) setIfNil(cfg, "aws_session_token", vars["aws_session_token"]) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index c408ccca690..6aaeb92c108 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -2,23 +2,12 @@ package athena import ( "context" - "errors" "fmt" - "time" - "github.com/aws/aws-sdk-go-v2/aws" - awsconfig "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/credentials" - "github.com/aws/aws-sdk-go-v2/service/athena" - "github.com/aws/aws-sdk-go-v2/service/athena/types" - s3v2 "github.com/aws/aws-sdk-go-v2/service/s3" - s3v2types "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/mitchellh/mapstructure" "github.com/rilldata/rill/runtime/drivers" "github.com/rilldata/rill/runtime/pkg/activity" "go.uber.org/zap" - "gocloud.dev/blob" - "gocloud.dev/blob/s3blob" ) func init() { @@ -40,20 +29,27 @@ var spec = drivers.Spec{ Placeholder: "select * from catalog.table;", }, { - Key: "athena_output_location", + Key: "output_location", DisplayName: "S3 output location", - Description: "Oputut location for query results in S3.", + Description: "Output location for query results in S3.", Placeholder: "s3://bucket-name/path/", Type: drivers.StringPropertyType, - Required: true, + Required: false, }, { - Key: "athena_workgroup", + Key: "workgroup", DisplayName: "AWS Athena workgroup", Description: "AWS Athena workgroup to use for queries.", Type: drivers.StringPropertyType, Required: false, }, + { + Key: "region", + DisplayName: "AWS region", + Description: "AWS region to connect to Athena and the output location.", + Type: drivers.StringPropertyType, + Required: false, + }, }, ConfigProperties: []drivers.PropertySchema{ { @@ -69,14 +65,7 @@ var spec = drivers.Spec{ type driver struct{} -type configProperties struct { - AccessKeyID string `mapstructure:"aws_access_key_id"` - SecretAccessKey string `mapstructure:"aws_secret_access_key"` - SessionToken string `mapstructure:"aws_access_token"` - AllowHostAccess bool `mapstructure:"allow_host_access"` -} - -func (d driver) Open(config map[string]any, shared bool, client activity.Client, logger *zap.Logger) (drivers.Handle, error) { +func (d driver) Open(config map[string]any, shared bool, _ activity.Client, logger *zap.Logger) (drivers.Handle, error) { if shared { return nil, fmt.Errorf("athena driver can't be shared") } @@ -105,22 +94,6 @@ func (d driver) HasAnonymousSourceAccess(ctx context.Context, src map[string]any return false, nil } -type sourceProperties struct { - SQL string `mapstructure:"sql"` - OutputLocation string `mapstructure:"athena_output_location"` - WorkGroup string `mapstructure:"athena_workgroup"` -} - -func parseSourceProperties(props map[string]any) (*sourceProperties, error) { - conf := &sourceProperties{} - err := mapstructure.Decode(props, conf) - if err != nil { - return nil, err - } - - return conf, nil -} - type Connection struct { config *configProperties logger *zap.Logger @@ -194,144 +167,9 @@ func (c *Connection) AsSQLStore() (drivers.SQLStore, bool) { return c, true } -func cleanPath(ctx context.Context, cfg aws.Config, bucketName, prefix string) error { - s3client := s3v2.NewFromConfig(cfg) - out, err := s3client.ListObjectsV2(ctx, &s3v2.ListObjectsV2Input{ - Bucket: &bucketName, - Prefix: &prefix, - }) - if err != nil { - return 
err - } - - if len(out.Contents) > 1000 { // aws error is opaque here - return fmt.Errorf("too many objects to delete %d from %s", len(out.Contents), "s3://"+bucketName+"/"+prefix) - } - - ids := make([]s3v2types.ObjectIdentifier, 0, len(out.Contents)) - for _, o := range out.Contents { - ids = append(ids, s3v2types.ObjectIdentifier{ - Key: o.Key, - }) - } - _, err = s3client.DeleteObjects(ctx, &s3v2.DeleteObjectsInput{ - Bucket: &bucketName, - Delete: &s3v2types.Delete{ - Objects: ids, - }, - }) - return err -} - -type janitorIterator struct { - drivers.FileIterator - ctx context.Context - cfg aws.Config - bucketName string - unloadPath string -} - -func (ci janitorIterator) Close() error { - err := ci.FileIterator.Close() - if err != nil { - return err - } - - return cleanPath(ci.ctx, ci.cfg, ci.bucketName, ci.unloadPath) -} - -func (c *Connection) newCfg(ctx context.Context) (aws.Config, error) { - var cfg aws.Config - var err error - if c.config.AllowHostAccess { - cfg, err = awsconfig.LoadDefaultConfig( - ctx, - ) - } else { - cfg, err = awsconfig.LoadDefaultConfig( - ctx, - awsconfig.WithCredentialsProvider(credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)), - ) - } - if err != nil { - return aws.Config{}, err - } - - return cfg, nil -} - -func (c *Connection) openBucket(ctx context.Context, conf *sourceProperties, bucket string) (*blob.Bucket, error) { - cfg, err := c.newCfg(ctx) - if err != nil { - return nil, err - } - - s3client := s3v2.NewFromConfig(cfg) - return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) -} - -func resolveOutputLocation(ctx context.Context, client *athena.Client, conf *sourceProperties) (string, error) { - if conf.OutputLocation != "" { - return conf.OutputLocation, nil - } else if conf.WorkGroup != "" { - wo, err := client.GetWorkGroup(ctx, &athena.GetWorkGroupInput{ - WorkGroup: aws.String(conf.WorkGroup), - }) - if err != nil { - return "", err - } - return *wo.WorkGroup.Configuration.ResultConfiguration.OutputLocation, nil - } - - return "", errors.New("Athena output location or Athena workgroup is required") -} - -func (c *Connection) unload(ctx context.Context, client *athena.Client, cfg aws.Config, conf *sourceProperties, unloadLocation string) error { - finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, unloadLocation) - - resultConfig := &types.ResultConfiguration{ - OutputLocation: aws.String(conf.OutputLocation), - } - - executeParams := &athena.StartQueryExecutionInput{ - QueryString: aws.String(finalSQL), - ResultConfiguration: resultConfig, - } - - if conf.WorkGroup != "" { - executeParams = &athena.StartQueryExecutionInput{ - QueryString: aws.String(finalSQL), - WorkGroup: aws.String(conf.WorkGroup), - } - } - - athenaExecution, err := client.StartQueryExecution(ctx, executeParams) - if err != nil { - return err - } - - tm := time.NewTimer(5 * time.Minute) - defer tm.Stop() - for { - select { - case <-tm.C: - fmt.Errorf("Athena ingestion timeout") - default: - status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ - QueryExecutionId: athenaExecution.QueryExecutionId, - }) - if err != nil { - return err - } - - state := status.QueryExecution.Status.State - - if state == types.QueryExecutionStateSucceeded || state == types.QueryExecutionStateCancelled { - return nil - } else if state == types.QueryExecutionStateFailed { - return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) 
-			}
-		}
-		time.Sleep(time.Second)
-	}
+type configProperties struct {
+	AccessKeyID string `mapstructure:"aws_access_key_id"`
+	SecretAccessKey string `mapstructure:"aws_secret_access_key"`
+	SessionToken string `mapstructure:"aws_access_token"`
+	AllowHostAccess bool `mapstructure:"allow_host_access"`
 }
diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go
index 237dda9dc6e..8b3722cb351 100644
--- a/runtime/drivers/athena/sql_store.go
+++ b/runtime/drivers/athena/sql_store.go
@@ -4,51 +4,97 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"net/url"
 	"strings"
+	"time"
 
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/credentials"
 	"github.com/aws/aws-sdk-go-v2/service/athena"
+	types2 "github.com/aws/aws-sdk-go-v2/service/athena/types"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	"github.com/aws/aws-sdk-go-v2/service/s3/types"
 	"github.com/google/uuid"
+	"github.com/mitchellh/mapstructure"
 	"github.com/rilldata/rill/runtime/drivers"
 	rillblob "github.com/rilldata/rill/runtime/drivers/blob"
+	"gocloud.dev/blob"
+	"gocloud.dev/blob/s3blob"
 )
 
-func (c *Connection) Query(ctx context.Context, props map[string]any) (drivers.RowIterator, error) {
+func (c *Connection) Query(_ context.Context, _ map[string]any) (drivers.RowIterator, error) {
 	return nil, fmt.Errorf("not implemented")
 }
 
-func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, opt *drivers.QueryOption, p drivers.Progress) (drivers.FileIterator, error) {
+func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ *drivers.QueryOption, _ drivers.Progress) (drivers.FileIterator, error) {
 	conf, err := parseSourceProperties(props)
 	if err != nil {
 		return nil, fmt.Errorf("failed to parse config: %w", err)
 	}
 
-	cfg, err := c.newCfg(ctx)
+	// Determine AWS region if it is not specified
+	determineAWSRegion := conf.AWSRegion == ""
+
+	awsRegion := conf.AWSRegion
+	if determineAWSRegion {
+		// AWS region is not specified, use 'us-east-1' for now
+		awsRegion = "us-east-1"
+	}
+
+	awsConfig, err := c.awsConfig(ctx, awsRegion)
 	if err != nil {
 		return nil, err
 	}
 
-	client := athena.NewFromConfig(cfg)
+	client := athena.NewFromConfig(awsConfig)
 	outputLocation, err := resolveOutputLocation(ctx, client, conf)
 	if err != nil {
 		return nil, err
 	}
 
-	// ie
-	// outputLocation s3://bucket-name/prefix
-	// unloadLocation s3://bucket-name/prefix/rill-connector-parquet-output-<uuid>
-	// unloadPath prefix/rill-connector-parquet-output-<uuid>
-	unloadFolderName := "parquet_output_" + uuid.New().String()
-	bucketName := strings.Split(strings.TrimPrefix(outputLocation, "s3://"), "/")[0]
-	unloadLocation := strings.TrimRight(outputLocation, "/") + "/" + unloadFolderName
-	unloadPath := strings.TrimPrefix(strings.TrimPrefix(unloadLocation, "s3://"+bucketName), "/")
-	err = c.unload(ctx, client, cfg, conf, unloadLocation)
+	outputURL, err := url.Parse(outputLocation)
+	if err != nil {
+		return nil, err
+	}
+
+	// outputLocation s3://bucket/path
+	// unloadLocation s3://bucket/path/rill_tmp_<uuid>
+	// unloadPath path/rill_tmp_<uuid>
+	unloadFolderName := "rill_tmp_" + uuid.New().String()
+	bucketName := outputURL.Hostname()
+	unloadURL := outputURL.JoinPath(unloadFolderName)
+	unloadLocation := unloadURL.String()
+	unloadPath := strings.TrimPrefix(unloadURL.Path, "/")
+
+	// Determine actual AWS region and update the config if needed
+	if determineAWSRegion {
+		actualRegion, err := getActualAWSRegion(ctx, awsConfig, bucketName)
+		if err != nil {
+			return nil, err
+		}
+
+		if awsRegion != actualRegion {
+			awsRegion = actualRegion
+			awsConfig, err = c.awsConfig(ctx, awsRegion)
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	cleanUp := func() error {
+		return deleteObjectsInPrefix(ctx, awsConfig, bucketName, unloadPath)
+	}
+
+	err = c.unload(ctx, client, conf, unloadLocation)
 	if err != nil {
-		return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanPath(ctx, cfg, bucketName, unloadPath))
+		return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanUp())
 	}
 
-	bucketObj, err := c.openBucket(ctx, conf, bucketName)
+	bucketObj, err := openBucket(ctx, awsConfig, bucketName)
 	if err != nil {
-		return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", bucketName, err), cleanPath(ctx, cfg, bucketName, unloadPath))
+		return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", bucketName, err), cleanUp())
 	}
 
 	opts := rillblob.Options{
@@ -57,14 +103,215 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, opt
 	it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger)
 	if err != nil {
-		return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanPath(ctx, cfg, bucketName, unloadPath))
+		return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanUp())
 	}
 
-	return janitorIterator{
+	return autoDeleteFileIterator{
 		FileIterator: it,
 		ctx: ctx,
 		unloadPath: unloadPath,
 		bucketName: bucketName,
-		cfg: cfg,
+		cfg: awsConfig,
 	}, nil
 }
+
+func (c *Connection) awsConfig(ctx context.Context, awsRegion string) (aws.Config, error) {
+	loadOptions := make([]func(*config.LoadOptions) error, 0)
+
+	if awsRegion != "" {
+		loadOptions = append(loadOptions, config.WithDefaultRegion(awsRegion))
+	}
+
+	// If one of the static properties is specified: access key, secret key, or session token, use static credentials,
+	// Else fallback to the SDK's default credential chain (environment, instance, etc) unless AllowHostAccess is false
+	if c.config.AccessKeyID != "" || c.config.SecretAccessKey != "" || c.config.SessionToken != "" {
+		p := credentials.NewStaticCredentialsProvider(c.config.AccessKeyID, c.config.SecretAccessKey, c.config.SessionToken)
+		loadOptions = append(loadOptions, config.WithCredentialsProvider(p))
+	} else if !c.config.AllowHostAccess {
+		return aws.Config{}, fmt.Errorf("static creds are not provided, and host access is not allowed")
+	}
+
+	return config.LoadDefaultConfig(ctx, loadOptions...)
+} + +func (c *Connection) unload(ctx context.Context, client *athena.Client, conf *sourceProperties, unloadLocation string) error { + finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, unloadLocation) + + executeParams := &athena.StartQueryExecutionInput{ + QueryString: aws.String(finalSQL), + } + + // If output_location is set, use it and don't set workgroup because the workgroup can override the output location + // Otherwise use specified workgroup or the "primary" workgroup + // see https://docs.aws.amazon.com/athena/latest/ug/querying.html + if conf.OutputLocation != "" { + executeParams.ResultConfiguration = &types2.ResultConfiguration{ + OutputLocation: aws.String(conf.OutputLocation), + } + } else { + workgroup := conf.Workgroup + if workgroup == "" { + // fallback to "primary" (default) workgroup if no workgroup is specified + workgroup = "primary" + } + executeParams.WorkGroup = aws.String(workgroup) + } + + queryExecutionOutput, err := client.StartQueryExecution(ctx, executeParams) + if err != nil { + return err + } + + tm := time.NewTimer(5 * time.Minute) + defer tm.Stop() + for { + select { + case <-tm.C: + return fmt.Errorf("Athena ingestion timed out") + default: + status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ + QueryExecutionId: queryExecutionOutput.QueryExecutionId, + }) + if err != nil { + return err + } + + switch status.QueryExecution.Status.State { + case types2.QueryExecutionStateSucceeded, types2.QueryExecutionStateCancelled: + return nil + case types2.QueryExecutionStateFailed: + return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) + } + } + time.Sleep(time.Second) + } +} + +func parseSourceProperties(props map[string]any) (*sourceProperties, error) { + conf := &sourceProperties{} + err := mapstructure.Decode(props, conf) + if err != nil { + return nil, err + } + + return conf, nil +} + +func resolveOutputLocation(ctx context.Context, client *athena.Client, conf *sourceProperties) (string, error) { + if conf.OutputLocation != "" { + return conf.OutputLocation, nil + } + + workgroup := conf.Workgroup + // fallback to "primary" (default) workgroup if no workgroup is specified + if workgroup == "" { + workgroup = "primary" + } + + wo, err := client.GetWorkGroup(ctx, &athena.GetWorkGroupInput{ + WorkGroup: aws.String(workgroup), + }) + if err != nil { + return "", err + } + + resultConfiguration := wo.WorkGroup.Configuration.ResultConfiguration + if resultConfiguration != nil && resultConfiguration.OutputLocation != nil && *resultConfiguration.OutputLocation != "" { + return *resultConfiguration.OutputLocation, nil + } + + return "", fmt.Errorf("either output_location or workgroup with an output location must be set") +} + +func getActualAWSRegion(ctx context.Context, awsConfig aws.Config, bucketName string) (string, error) { + s3client := s3.NewFromConfig(awsConfig) + + resp, err := s3client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ + Bucket: &bucketName, + }) + if err != nil { + return "", err + } + + actualRegion := string(resp.LocationConstraint) + if actualRegion == "" { // For US East (N. 
Virginia) region + actualRegion = "us-east-1" + } + return actualRegion, nil +} + +func openBucket(ctx context.Context, cfg aws.Config, bucket string) (*blob.Bucket, error) { + s3client := s3.NewFromConfig(cfg) + return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) +} + +func deleteObjectsInPrefix(ctx context.Context, cfg aws.Config, bucketName, prefix string) error { + s3client := s3.NewFromConfig(cfg) + + deleteBatch := func(objects []types.ObjectIdentifier) error { + _, err := s3client.DeleteObjects(ctx, &s3.DeleteObjectsInput{ + Bucket: &bucketName, + Delete: &types.Delete{ + Objects: objects, + }, + }) + return err + } + + var continuationToken *string + for { + out, err := s3client.ListObjectsV2(ctx, &s3.ListObjectsV2Input{ + Bucket: &bucketName, + Prefix: &prefix, + ContinuationToken: continuationToken, + }) + if err != nil { + return err + } + + ids := make([]types.ObjectIdentifier, 0, len(out.Contents)) + for _, o := range out.Contents { + ids = append(ids, types.ObjectIdentifier{ + Key: o.Key, + }) + } + + if len(ids) > 0 { + if err := deleteBatch(ids); err != nil { + return err + } + } + + if out.IsTruncated { + continuationToken = out.NextContinuationToken + } else { + break + } + } + + return nil +} + +type sourceProperties struct { + SQL string `mapstructure:"sql"` + OutputLocation string `mapstructure:"output_location"` + Workgroup string `mapstructure:"workgroup"` + AWSRegion string `mapstructure:"region"` +} + +type autoDeleteFileIterator struct { + drivers.FileIterator + ctx context.Context + cfg aws.Config + bucketName string + unloadPath string +} + +func (ci autoDeleteFileIterator) Close() error { + err := ci.FileIterator.Close() + if err != nil { + return err + } + + return deleteObjectsInPrefix(ci.ctx, ci.cfg, ci.bucketName, ci.unloadPath) +} diff --git a/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go b/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go index 073a5cb5a42..813c8cfd355 100644 --- a/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/filestore_to_duckDB.go @@ -59,7 +59,7 @@ func (t *fileStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps ma } // Ingest data - from, err := sourceReader(localPaths, format, srcCfg.DuckDB, false) + from, err := sourceReader(localPaths, format, srcCfg.DuckDB) if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go index f5ebbd5c4df..d2121bd64ed 100644 --- a/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/objectStore_to_duckDB.go @@ -96,7 +96,7 @@ func (t *objectStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps return err } } else { - from, err := sourceReader(files, format, srcCfg.DuckDB, false) + from, err := sourceReader(files, format, srcCfg.DuckDB) if err != nil { return err } @@ -136,7 +136,7 @@ func newAppender(to drivers.OLAPStore, sink *sinkProperties, ingestionProps map[ } func (a *appender) appendData(ctx context.Context, files []string, format string) error { - from, err := sourceReader(files, format, a.ingestionProps, false) + from, err := sourceReader(files, format, a.ingestionProps) if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go index b53cbf26def..58498509ee5 100644 --- a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go +++ 
b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go @@ -69,7 +69,13 @@ func (s *sqlStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps map } format := fileutil.FullExt(files[0]) - from, err := sourceReader(files, format, make(map[string]any), fromAthena) + if fromAthena { + // Athena doesn't specify ".parquet" extension in output file names + // Append ".parquet" extension to the extension generated by Athena + format += ".parquet" + } + + from, err := sourceReader(files, format, make(map[string]any)) if err != nil { return err } diff --git a/runtime/drivers/duckdb/transporter/utils.go b/runtime/drivers/duckdb/transporter/utils.go index c7b56fdd1dc..51c9acf73c0 100644 --- a/runtime/drivers/duckdb/transporter/utils.go +++ b/runtime/drivers/duckdb/transporter/utils.go @@ -98,12 +98,12 @@ func parseFileSourceProperties(props map[string]any) (*fileSourceProperties, err return cfg, nil } -func sourceReader(paths []string, format string, ingestionProps map[string]any, fromAthena bool) (string, error) { +func sourceReader(paths []string, format string, ingestionProps map[string]any) (string, error) { // Generate a "read" statement if containsAny(format, []string{".csv", ".tsv", ".txt"}) { // CSV reader return generateReadCsvStatement(paths, ingestionProps) - } else if strings.Contains(format, ".parquet") || fromAthena { + } else if strings.Contains(format, ".parquet") { // Parquet reader return generateReadParquetStatement(paths, ingestionProps) } else if containsAny(format, []string{".json", ".ndjson"}) { diff --git a/runtime/services/catalog/artifacts/yaml/objects.go b/runtime/services/catalog/artifacts/yaml/objects.go index 7078a118539..eb795c2a199 100644 --- a/runtime/services/catalog/artifacts/yaml/objects.go +++ b/runtime/services/catalog/artifacts/yaml/objects.go @@ -45,8 +45,8 @@ type Source struct { SQL string `yaml:"sql,omitempty" mapstructure:"sql,omitempty"` DB string `yaml:"db,omitempty" mapstructure:"db,omitempty"` ProjectID string `yaml:"project_id,omitempty" mapstructure:"project_id,omitempty"` - AthenaOutputLocation string `yaml:"athena_output_location,omitempty" mapstructure:"athena_output_location,omitempty"` - AthenaWorkgroup string `yaml:"athena_workgroup,omitempty" mapstructure:"athena_workgroup,omitempty"` + AthenaOutputLocation string `yaml:"output_location,omitempty" mapstructure:"output_location,omitempty"` + AthenaWorkgroup string `yaml:"workgroup,omitempty" mapstructure:"workgroup,omitempty"` } type MetricsView struct { @@ -210,11 +210,11 @@ func fromSourceArtifact(source *Source, path string) (*drivers.CatalogEntry, err } if source.AthenaOutputLocation != "" { - props["athena_output_location"] = source.AthenaOutputLocation + props["output_location"] = source.AthenaOutputLocation } if source.AthenaWorkgroup != "" { - props["athena_workgroup"] = source.AthenaWorkgroup + props["workgroup"] = source.AthenaWorkgroup } propsPB, err := structpb.NewStruct(props) diff --git a/runtime/services/catalog/migrator/sources/sources.go b/runtime/services/catalog/migrator/sources/sources.go index 570aa809931..c805b2c3374 100644 --- a/runtime/services/catalog/migrator/sources/sources.go +++ b/runtime/services/catalog/migrator/sources/sources.go @@ -376,11 +376,7 @@ func connectorVariables(src *runtimev1.Source, env map[string]string, repoRoot s "allow_host_access": strings.EqualFold(env["allow_host_access"], "true"), } switch connector { - case "s3": - vars["aws_access_key_id"] = env["aws_access_key_id"] - vars["aws_secret_access_key"] = env["aws_secret_access_key"] 
- vars["aws_session_token"] = env["aws_session_token"] - case "athena": + case "s3", "athena": vars["aws_access_key_id"] = env["aws_access_key_id"] vars["aws_secret_access_key"] = env["aws_secret_access_key"] vars["aws_session_token"] = env["aws_session_token"] diff --git a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts index 148e8f66596..2fac24dca9b 100644 --- a/web-common/src/features/sources/modal/submitRemoteSourceForm.ts +++ b/web-common/src/features/sources/modal/submitRemoteSourceForm.ts @@ -64,8 +64,8 @@ export async function submitRemoteSourceForm( Object.entries(values).map(([key, value]) => { switch (key) { case "project_id": - case "athena_output_location": - case "athena_workgroup": + case "output_location": + case "workgroup": return [key, value]; default: return [fromYupFriendlyKey(key), value]; diff --git a/web-common/src/features/sources/modal/yupSchemas.ts b/web-common/src/features/sources/modal/yupSchemas.ts index 9474266edba..4dd2fc46280 100644 --- a/web-common/src/features/sources/modal/yupSchemas.ts +++ b/web-common/src/features/sources/modal/yupSchemas.ts @@ -79,8 +79,8 @@ export function getYupSchema(connector: V1ConnectorSpec) { "Source name must start with a letter or underscore and contain only letters, numbers, and underscores" ) .required("Source name is required"), - athena_output_location: yup.string().required(), - athena_workgroup: yup.string(), + output_location: yup.string(), + workgroup: yup.string(), }); default: From 28f3e6ec5116797e882ac64a236f9baf3c6b5396 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Sun, 24 Sep 2023 00:05:56 +0300 Subject: [PATCH 29/40] Athena icon --- web-common/src/features/sources/modal/AddSourceModal.svelte | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web-common/src/features/sources/modal/AddSourceModal.svelte b/web-common/src/features/sources/modal/AddSourceModal.svelte index 03e9ef0a3a3..308ecb7a59a 100644 --- a/web-common/src/features/sources/modal/AddSourceModal.svelte +++ b/web-common/src/features/sources/modal/AddSourceModal.svelte @@ -22,6 +22,7 @@ import LocalSourceUpload from "./LocalSourceUpload.svelte"; import RemoteSourceForm from "./RemoteSourceForm.svelte"; import RequestConnectorForm from "./RequestConnectorForm.svelte"; + import AmazonAthena from "@rilldata/web-common/components/icons/connectors/AmazonAthena.svelte"; export let open: boolean; @@ -48,7 +49,7 @@ // azure_blob_storage: MicrosoftAzureBlobStorage, // duckdb: DuckDB, bigquery: GoogleBigQuery, - // athena: AmazonAthena, + athena: AmazonAthena, motherduck: MotherDuck, // postgres: Postgres, local_file: LocalFile, From 18b423dfc0aabe252e0433523334e83463707d7c Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 16:27:26 +0300 Subject: [PATCH 30/40] Removed the auto-resolving of AWS region --- runtime/drivers/athena/athena.go | 2 ++ runtime/drivers/athena/sql_store.go | 53 ++++------------------------- 2 files changed, 8 insertions(+), 47 deletions(-) diff --git a/runtime/drivers/athena/athena.go b/runtime/drivers/athena/athena.go index 6aaeb92c108..76a7b470c59 100644 --- a/runtime/drivers/athena/athena.go +++ b/runtime/drivers/athena/athena.go @@ -40,6 +40,7 @@ var spec = drivers.Spec{ Key: "workgroup", DisplayName: "AWS Athena workgroup", Description: "AWS Athena workgroup to use for queries.", + Placeholder: "primary", Type: drivers.StringPropertyType, 
Required: false, }, @@ -47,6 +48,7 @@ var spec = drivers.Spec{ Key: "region", DisplayName: "AWS region", Description: "AWS region to connect to Athena and the output location.", + Placeholder: "us-east-1", Type: drivers.StringPropertyType, Required: false, }, diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index 8b3722cb351..abbb26c4257 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -33,16 +33,7 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * return nil, fmt.Errorf("failed to parse config: %w", err) } - // Determine AWS region if it is not specified - determineAWSRegion := conf.AWSRegion == "" - - awsRegion := conf.AWSRegion - if determineAWSRegion { - // AWS region is not specified, use 'us-east-1' for now - awsRegion = "us-east-1" - } - - awsConfig, err := c.awsConfig(ctx, awsRegion) + awsConfig, err := c.awsConfig(ctx, conf.AWSRegion) if err != nil { return nil, err } @@ -67,22 +58,6 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * unloadLocation := unloadURL.String() unloadPath := strings.TrimPrefix(unloadURL.Path, "/") - // Determine actual AWS region and update the config if needed - if determineAWSRegion { - actualRegion, err := getActualAWSRegion(ctx, awsConfig, bucketName) - if err != nil { - return nil, err - } - - if awsRegion != actualRegion { - awsRegion = actualRegion - awsConfig, err = c.awsConfig(ctx, awsRegion) - if err != nil { - return nil, err - } - } - } - cleanUp := func() error { return deleteObjectsInPrefix(ctx, awsConfig, bucketName, unloadPath) } @@ -116,10 +91,11 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * } func (c *Connection) awsConfig(ctx context.Context, awsRegion string) (aws.Config, error) { - loadOptions := make([]func(*config.LoadOptions) error, 0) - - if awsRegion != "" { - loadOptions = append(loadOptions, config.WithDefaultRegion(awsRegion)) + loadOptions := []func(*config.LoadOptions) error{ + // Setting the default region to an empty string, will result in the default region value being ignored + config.WithDefaultRegion("us-east-1"), + // Setting the region to an empty string, will result in the region value being ignored + config.WithRegion(awsRegion), } // If one of the static properties is specified: access key, secret key, or session token, use static credentials, @@ -223,23 +199,6 @@ func resolveOutputLocation(ctx context.Context, client *athena.Client, conf *sou return "", fmt.Errorf("either output_location or workgroup with an output location must be set") } -func getActualAWSRegion(ctx context.Context, awsConfig aws.Config, bucketName string) (string, error) { - s3client := s3.NewFromConfig(awsConfig) - - resp, err := s3client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ - Bucket: &bucketName, - }) - if err != nil { - return "", err - } - - actualRegion := string(resp.LocationConstraint) - if actualRegion == "" { // For US East (N. 
Virginia) region - actualRegion = "us-east-1" - } - return actualRegion, nil -} - func openBucket(ctx context.Context, cfg aws.Config, bucket string) (*blob.Bucket, error) { s3client := s3.NewFromConfig(cfg) return s3blob.OpenBucketV2(ctx, s3client, bucket, nil) From 679e85dfe53205845399a4b1adad11d3ac371a83 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 16:39:06 +0300 Subject: [PATCH 31/40] Simplified a clean-up process --- runtime/drivers/athena/sql_store.go | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index abbb26c4257..175a5777bed 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -58,18 +58,18 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * unloadLocation := unloadURL.String() unloadPath := strings.TrimPrefix(unloadURL.Path, "/") - cleanUp := func() error { + cleanupFn := func() error { return deleteObjectsInPrefix(ctx, awsConfig, bucketName, unloadPath) } err = c.unload(ctx, client, conf, unloadLocation) if err != nil { - return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanUp()) + return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanupFn()) } bucketObj, err := openBucket(ctx, awsConfig, bucketName) if err != nil { - return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", bucketName, err), cleanUp()) + return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", bucketName, err), cleanupFn()) } opts := rillblob.Options{ @@ -78,15 +78,12 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) if err != nil { - return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanUp()) + return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanupFn()) } return autoDeleteFileIterator{ FileIterator: it, - ctx: ctx, - unloadPath: unloadPath, - bucketName: bucketName, - cfg: awsConfig, + cleanupFn: cleanupFn, }, nil } @@ -260,17 +257,14 @@ type sourceProperties struct { type autoDeleteFileIterator struct { drivers.FileIterator - ctx context.Context - cfg aws.Config - bucketName string - unloadPath string + cleanupFn func() error } -func (ci autoDeleteFileIterator) Close() error { - err := ci.FileIterator.Close() +func (i autoDeleteFileIterator) Close() error { + err := i.FileIterator.Close() if err != nil { return err } - return deleteObjectsInPrefix(ci.ctx, ci.cfg, ci.bucketName, ci.unloadPath) + return i.cleanupFn() } From 3ab245d009fceb20e7d0f4e7ed757fedb175eb36 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 18:10:06 +0300 Subject: [PATCH 32/40] Updated according to previously merged changes --- runtime/drivers/athena/sql_store.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index 175a5777bed..fcc6cafcbc0 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -24,7 +24,7 @@ import ( ) func (c *Connection) Query(_ context.Context, _ map[string]any) (drivers.RowIterator, error) { - return nil, fmt.Errorf("not implemented") + return nil, drivers.ErrNotImplemented } func (c *Connection) QueryAsFiles(ctx context.Context, props 
map[string]any, _ *drivers.QueryOption, _ drivers.Progress) (drivers.FileIterator, error) { From 804de57c40b68f82ee8bd2c139e7f4d788f4ff95 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 22:07:57 +0300 Subject: [PATCH 33/40] Dash vs underscore --- runtime/drivers/athena/sql_store.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index fcc6cafcbc0..4ef2d2228c1 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -50,9 +50,9 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * } // outputLocation s3://bucket/path - // unloadLocation s3://bucket/path/rill_tmp_<uuid> - // unloadPath path/rill_tmp_<uuid> - unloadFolderName := "rill_tmp_" + uuid.New().String() + // unloadLocation s3://bucket/path/rill-tmp-<uuid> + // unloadPath path/rill-tmp-<uuid> + unloadFolderName := "rill-tmp-" + uuid.New().String() bucketName := outputURL.Hostname() unloadURL := outputURL.JoinPath(unloadFolderName) unloadLocation := unloadURL.String() From 591a4efdca5ee0cf48f5b03ec4fd58c0d178121d Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 22:08:33 +0300 Subject: [PATCH 34/40] A new line after a query --- runtime/drivers/athena/sql_store.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index 4ef2d2228c1..c05d0b4964a 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -108,7 +108,7 @@ func (c *Connection) awsConfig(ctx context.Context, awsRegion string) (aws.Confi } func (c *Connection) unload(ctx context.Context, client *athena.Client, conf *sourceProperties, unloadLocation string) error { - finalSQL := fmt.Sprintf("UNLOAD (%s) TO '%s' WITH (format = 'PARQUET')", conf.SQL, unloadLocation) + finalSQL := fmt.Sprintf("UNLOAD (%s\n) TO '%s' WITH (format = 'PARQUET')", conf.SQL, unloadLocation) executeParams := &athena.StartQueryExecutionInput{ QueryString: aws.String(finalSQL), From bc7f95d3cd5da77f981e02c74c6c0331cf425c0f Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 22:11:33 +0300 Subject: [PATCH 35/40] Non-nil NextContinuationToken --- runtime/drivers/athena/sql_store.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index c05d0b4964a..b06d7fbbd03 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -238,7 +238,7 @@ func deleteObjectsInPrefix(ctx context.Context, cfg aws.Config, bucketName, pref } } - if out.IsTruncated { + if out.IsTruncated && out.NextContinuationToken != nil { continuationToken = out.NextContinuationToken } else { break From af69414ee185e4925752511bd28158224075cf32 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 22:22:59 +0300 Subject: [PATCH 36/40] ctx cancellation instead of a hardcoded timer --- runtime/drivers/athena/sql_store.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index b06d7fbbd03..a4b4f3b0259 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -135,12 +135,13 @@ func (c *Connection) 
unload(ctx context.Context, client *athena.Client, conf *so return err } - tm := time.NewTimer(5 * time.Minute) - defer tm.Stop() for { select { - case <-tm.C: - return fmt.Errorf("Athena ingestion timed out") + case <-ctx.Done(): + _, err = client.StopQueryExecution(ctx, &athena.StopQueryExecutionInput{ + QueryExecutionId: queryExecutionOutput.QueryExecutionId, + }) + return errors.Join(ctx.Err(), err) default: status, err := client.GetQueryExecution(ctx, &athena.GetQueryExecutionInput{ QueryExecutionId: queryExecutionOutput.QueryExecutionId, @@ -150,8 +151,10 @@ func (c *Connection) unload(ctx context.Context, client *athena.Client, conf *so } switch status.QueryExecution.Status.State { - case types2.QueryExecutionStateSucceeded, types2.QueryExecutionStateCancelled: + case types2.QueryExecutionStateSucceeded: return nil + case types2.QueryExecutionStateCancelled: + return fmt.Errorf("Athena query execution cancelled") case types2.QueryExecutionStateFailed: return fmt.Errorf("Athena query execution failed %s", *status.QueryExecution.Status.AthenaError.ErrorMessage) } From 0bd4d25e2c3a3cb563a7d5cd7cf62fff0b9e7ada Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 22:52:52 +0300 Subject: [PATCH 37/40] Format for FileIterator --- runtime/drivers/athena/sql_store.go | 1 + runtime/drivers/bigquery/sql_store.go | 4 ++++ runtime/drivers/blob/blobdownloader.go | 6 ++++++ runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go | 9 ++------- runtime/drivers/duckdb/transporter/transporter_test.go | 4 ++++ runtime/drivers/object_store.go | 3 +++ 6 files changed, 20 insertions(+), 7 deletions(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index a4b4f3b0259..968dcaefcc1 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -74,6 +74,7 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * opts := rillblob.Options{ GlobPattern: unloadPath + "/**", + Format: "parquet", } it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) diff --git a/runtime/drivers/bigquery/sql_store.go b/runtime/drivers/bigquery/sql_store.go index e5b507e3ab1..40d89dd39a3 100644 --- a/runtime/drivers/bigquery/sql_store.go +++ b/runtime/drivers/bigquery/sql_store.go @@ -224,6 +224,10 @@ func (f *fileIterator) Size(unit drivers.ProgressUnit) (int64, bool) { } } +func (f *fileIterator) Format() string { + return "" +} + func (f *fileIterator) downloadAsJSONFile() error { tf := time.Now() defer func() { diff --git a/runtime/drivers/blob/blobdownloader.go b/runtime/drivers/blob/blobdownloader.go index 405ea07fa55..b1e82ffd7d9 100644 --- a/runtime/drivers/blob/blobdownloader.go +++ b/runtime/drivers/blob/blobdownloader.go @@ -65,6 +65,8 @@ type Options struct { StorageLimitInBytes int64 // Retain files and only delete during close KeepFilesUntilClose bool + // General blob format (json, csv, parquet, etc) + Format string } // sets defaults if not set by user @@ -373,6 +375,10 @@ func (it *blobIterator) KeepFilesUntilClose(keepFilesUntilClose bool) { it.opts.KeepFilesUntilClose = keepFilesUntilClose } +func (it *blobIterator) Format() string { + return it.opts.Format +} + // todo :: ideally planner should take ownership of the bucket and return an iterator with next returning objectWithPlan func (it *blobIterator) plan() ([]*objectWithPlan, error) { var ( diff --git a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go 
b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go index a9e71491e40..48720820478 100644 --- a/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go +++ b/runtime/drivers/duckdb/transporter/sqlstore_to_duckDB.go @@ -7,13 +7,11 @@ import ( "errors" "fmt" "math" - "reflect" "time" "github.com/marcboeker/go-duckdb" runtimev1 "github.com/rilldata/rill/proto/gen/rill/runtime/v1" "github.com/rilldata/rill/runtime/drivers" - "github.com/rilldata/rill/runtime/drivers/athena" "github.com/rilldata/rill/runtime/pkg/fileutil" "github.com/rilldata/rill/runtime/pkg/observability" "go.uber.org/zap" @@ -78,7 +76,6 @@ func (s *sqlStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps map // TODO :: iteration over fileiterator is similar(apart from no schema changes possible here) // to consuming fileIterator in objectStore_to_duckDB // both can be refactored to follow same path - fromAthena := reflect.TypeOf(s.from).AssignableTo(reflect.TypeOf(&athena.Connection{})) for iter.HasNext() { files, err := iter.NextBatch(_sqlStoreIteratorBatchSize) if err != nil { @@ -86,10 +83,8 @@ func (s *sqlStoreToDuckDB) Transfer(ctx context.Context, srcProps, sinkProps map } format := fileutil.FullExt(files[0]) - if fromAthena { - // Athena doesn't specify ".parquet" extension in output file names - // Append ".parquet" extension to the extension generated by Athena - format += ".parquet" + if iter.Format() != "" { + format += "." + iter.Format() } from, err := sourceReader(files, format, make(map[string]any)) diff --git a/runtime/drivers/duckdb/transporter/transporter_test.go b/runtime/drivers/duckdb/transporter/transporter_test.go index c82193bff8d..65855373be3 100644 --- a/runtime/drivers/duckdb/transporter/transporter_test.go +++ b/runtime/drivers/duckdb/transporter/transporter_test.go @@ -53,6 +53,10 @@ func (m *mockIterator) Size(unit drivers.ProgressUnit) (int64, bool) { func (m *mockIterator) KeepFilesUntilClose(keepFilesUntilClose bool) { } +func (m *mockIterator) Format() string { + return "" +} + var _ drivers.FileIterator = &mockIterator{} func TestIterativeCSVIngestionWithVariableSchema(t *testing.T) { diff --git a/runtime/drivers/object_store.go b/runtime/drivers/object_store.go index 48714fb8573..bb65d3b1904 100644 --- a/runtime/drivers/object_store.go +++ b/runtime/drivers/object_store.go @@ -27,4 +27,7 @@ type FileIterator interface { // KeepFilesUntilClose marks the iterator to keep the files until close is called. // This is used when the entire list of files is used at once in certain cases. 
KeepFilesUntilClose(keepFilesUntilClose bool) + // Format returns general file format (json, csv, parquet, etc) + // Returns an empty string if there is no general format + Format() string } From a0d18da0df0440974555312c7bf2fe94165a870c Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 23:18:04 +0300 Subject: [PATCH 38/40] deferred cleanupFn() --- runtime/drivers/athena/sql_store.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index 968dcaefcc1..7c882bbe363 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -27,7 +27,7 @@ func (c *Connection) Query(_ context.Context, _ map[string]any) (drivers.RowIter return nil, drivers.ErrNotImplemented } -func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ *drivers.QueryOption, _ drivers.Progress) (drivers.FileIterator, error) { +func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ *drivers.QueryOption, _ drivers.Progress) (outIt drivers.FileIterator, outErr error) { conf, err := parseSourceProperties(props) if err != nil { return nil, fmt.Errorf("failed to parse config: %w", err) @@ -64,12 +64,26 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * err = c.unload(ctx, client, conf, unloadLocation) if err != nil { - return nil, errors.Join(fmt.Errorf("failed to unload: %w", err), cleanupFn()) + unloadErr := fmt.Errorf("failed to unload: %w", err) + cleanupErr := cleanupFn() + if cleanupErr != nil { + cleanupErr = fmt.Errorf("cleanup error: %w", cleanupErr) + } + return nil, errors.Join(unloadErr, cleanupErr) } + defer func() { + if outErr != nil { + cleanupErr := cleanupFn() + if cleanupErr != nil { + outErr = errors.Join(outErr, fmt.Errorf("cleanup error: %w", cleanupErr)) + } + } + }() + bucketObj, err := openBucket(ctx, awsConfig, bucketName) if err != nil { - return nil, errors.Join(fmt.Errorf("cannot open bucket %q: %w", bucketName, err), cleanupFn()) + return nil, fmt.Errorf("cannot open bucket %q: %w", bucketName, err) } opts := rillblob.Options{ @@ -79,7 +93,7 @@ func (c *Connection) QueryAsFiles(ctx context.Context, props map[string]any, _ * it, err := rillblob.NewIterator(ctx, bucketObj, opts, c.logger) if err != nil { - return nil, errors.Join(fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err), cleanupFn()) + return nil, fmt.Errorf("cannot download parquet output %q %w", opts.GlobPattern, err) } return autoDeleteFileIterator{ From a1d0eb946a57af489830966e901f944bf2b6c827 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 23:28:45 +0300 Subject: [PATCH 39/40] Aligned Athena query with a source config --- runtime/drivers/athena/sql_store.go | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/runtime/drivers/athena/sql_store.go b/runtime/drivers/athena/sql_store.go index 7c882bbe363..32b533961dc 100644 --- a/runtime/drivers/athena/sql_store.go +++ b/runtime/drivers/athena/sql_store.go @@ -129,20 +129,14 @@ func (c *Connection) unload(ctx context.Context, client *athena.Client, conf *so QueryString: aws.String(finalSQL), } - // If output_location is set, use it and don't set workgroup because the workgroup can override the output location - // Otherwise use specified workgroup or the "primary" workgroup - // see 
https://docs.aws.amazon.com/athena/latest/ug/querying.html if conf.OutputLocation != "" { executeParams.ResultConfiguration = &types2.ResultConfiguration{ OutputLocation: aws.String(conf.OutputLocation), } - } else { - workgroup := conf.Workgroup - if workgroup == "" { - // fallback to "primary" (default) workgroup if no workgroup is specified - workgroup = "primary" - } - executeParams.WorkGroup = aws.String(workgroup) + } + + if conf.Workgroup != "" { // primary is used if nothing is set + executeParams.WorkGroup = aws.String(conf.Workgroup) } queryExecutionOutput, err := client.StartQueryExecution(ctx, executeParams) From 480aa951c34a84a5d457a66932f49f66cd4cc676 Mon Sep 17 00:00:00 2001 From: "e.sevastyanov" <eugene.sevastianov@rilldata.com> Date: Mon, 25 Sep 2023 23:41:25 +0300 Subject: [PATCH 40/40] Fixed a merge conflict --- runtime/drivers/blob/blobdownloader.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/runtime/drivers/blob/blobdownloader.go b/runtime/drivers/blob/blobdownloader.go index bfa33219838..5674ba7b913 100644 --- a/runtime/drivers/blob/blobdownloader.go +++ b/runtime/drivers/blob/blobdownloader.go @@ -233,11 +233,7 @@ func (it *blobIterator) Next() ([]string, error) { return nil, io.EOF } - func (it *blobIterator) Format() string { - return it.opts.Format -} - -// Track the batch for cleanup in the next iteration + // Track the batch for cleanup in the next iteration it.lastBatch = batch // Clients may change the slice. Creating a copy to ensure we delete the files on next batch/close. @@ -246,6 +242,10 @@ func (it *blobIterator) Next() ([]string, error) { return result, nil } +func (it *blobIterator) Format() string { + return it.opts.Format +} + // TODO: Ideally planner should take ownership of the bucket and return an iterator with next returning objectWithPlan func (it *blobIterator) plan() ([]*objectWithPlan, error) { var ( @@ -469,6 +469,10 @@ func (it *prefetchedIterator) Next() ([]string, error) { return it.batch, nil } +func (it *prefetchedIterator) Format() string { + return it.underlying.Format() +} + // downloadResult represents a successfully downloaded file type downloadResult struct { path string