diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e6d6aac..2aba7da 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: go-version: '1.20' - uses: golangci/golangci-lint-action@v3 with: - version: v1.51.2 + version: v1.54.1 args: "--out-${NO_FUTURE}format colored-line-number" test: diff --git a/cmd/gpq/convert.go b/cmd/gpq/convert.go index 8aed67e..1fe336f 100644 --- a/cmd/gpq/convert.go +++ b/cmd/gpq/convert.go @@ -22,7 +22,6 @@ import ( "github.com/planetlabs/gpq/internal/geojson" "github.com/planetlabs/gpq/internal/geoparquet" - "github.com/segmentio/parquet-go" ) type ConvertCmd struct { @@ -33,7 +32,7 @@ type ConvertCmd struct { Min int `help:"Minimum number of features to consider when building a schema." default:"10"` Max int `help:"Maximum number of features to consider when building a schema." default:"100"` InputPrimaryColumn string `help:"Primary geometry column name when reading Parquet without metadata." default:"geometry"` - Compression string `help:"Parquet compression to use. Possible values: ${enum}." enum:"uncompressed, snappy, gzip, brotli, zstd, lz4raw" default:"gzip"` + Compression string `help:"Parquet compression to use. Possible values: ${enum}." enum:"uncompressed, snappy, gzip, brotli, zstd" default:"zstd"` } type FormatType string @@ -111,18 +110,8 @@ func (c *ConvertCmd) Run() error { return geojson.ToParquet(input, output, convertOptions) } - stat, statErr := os.Stat(c.Input) - if statErr != nil { - return fmt.Errorf("failed to get size of %q: %w", c.Input, statErr) - } - - file, fileErr := parquet.OpenFile(input, stat.Size()) - if fileErr != nil { - return fileErr - } - if outputFormat == GeoJSONType { - return geojson.FromParquet(file, output) + return geojson.FromParquet(input, output) } var convertOptions *geoparquet.ConvertOptions @@ -131,5 +120,6 @@ func (c *ConvertCmd) Run() error { InputPrimaryColumn: c.InputPrimaryColumn, } } - return geoparquet.FromParquet(file, output, convertOptions) + + return geoparquet.FromParquet(input, output, convertOptions) } diff --git a/cmd/gpq/describe.go b/cmd/gpq/describe.go index a0595ac..4eecc8b 100644 --- a/cmd/gpq/describe.go +++ b/cmd/gpq/describe.go @@ -22,10 +22,12 @@ import ( "strconv" "strings" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/file" + "github.com/apache/arrow/go/v14/parquet/schema" "github.com/jedib0t/go-pretty/v6/table" "github.com/jedib0t/go-pretty/v6/text" "github.com/planetlabs/gpq/internal/geoparquet" - "github.com/segmentio/parquet-go" "golang.org/x/term" ) @@ -53,29 +55,23 @@ func (c *DescribeCmd) Run() error { } defer input.Close() - stat, statErr := os.Stat(c.Input) - if statErr != nil { - return fmt.Errorf("failed to get size of %q: %w", c.Input, statErr) - } - - file, fileErr := parquet.OpenFile(input, stat.Size()) + fileReader, fileErr := file.NewParquetReader(input) if fileErr != nil { - return fileErr + return fmt.Errorf("failed to read %q as parquet: %w", c.Input, fileErr) } - metadata, geoErr := geoparquet.GetMetadata(file) + fileMetadata := fileReader.MetaData() + metadata, geoErr := geoparquet.GetMetadata(fileMetadata.KeyValueMetadata()) if geoErr != nil { if !errors.Is(geoErr, geoparquet.ErrNoMetadata) { return geoErr } } - schema := buildSchema("", file.Schema()) - info := &Info{ - Schema: schema, + Schema: buildSchema("", fileMetadata.Schema.Root()), Metadata: metadata, - NumRows: file.NumRows(), + NumRows: fileMetadata.NumRows, } if c.Format == "json" { @@ -211,47
+207,60 @@ type Schema struct { Fields []*Schema `json:"fields,omitempty"` } -func buildSchema(name string, node parquet.Node) *Schema { - nodeType := node.Type() +func buildSchema(name string, node schema.Node) *Schema { annotation := "" - if logicalType := nodeType.LogicalType(); logicalType != nil { - annotation = logicalType.String() + logicalType := node.LogicalType() + if !logicalType.IsNone() { + annotation = strings.ToLower(logicalType.String()) + } + + repetition := node.RepetitionType() + optional := false + repeated := false + if repetition == parquet.Repetitions.Optional { + optional = true + } else if repetition == parquet.Repetitions.Repeated { + repeated = true } field := &Schema{ Name: name, - Optional: node.Optional(), - Repeated: node.Repeated(), + Optional: optional, + Repeated: repeated, Annotation: annotation, } - if node.Leaf() { - switch nodeType.Kind() { - case parquet.Boolean: + if leaf, ok := node.(*schema.PrimitiveNode); ok { + switch leaf.PhysicalType() { + case parquet.Types.Boolean: field.Type = "boolean" - case parquet.Int32: + case parquet.Types.Int32: field.Type = "int32" - case parquet.Int64: + case parquet.Types.Int64: field.Type = "int64" - case parquet.Int96: + case parquet.Types.Int96: field.Type = "int96" - case parquet.Float: + case parquet.Types.Float: field.Type = "float" - case parquet.Double: + case parquet.Types.Double: field.Type = "double" - case parquet.ByteArray: + case parquet.Types.ByteArray: field.Type = "binary" - case parquet.FixedLenByteArray: - field.Type = fmt.Sprintf("fixed_len_byte_array(%d)", nodeType.Length()) + case parquet.Types.FixedLenByteArray: + field.Type = fmt.Sprintf("fixed_len_byte_array(%d)", leaf.TypeLength()) default: - field.Type = "unknown" + field.Type = leaf.PhysicalType().String() } return field } - field.Fields = make([]*Schema, len(node.Fields())) - for i, groupField := range node.Fields() { - field.Fields[i] = buildSchema(groupField.Name(), groupField) + if group, ok := node.(*schema.GroupNode); ok { + count := group.NumFields() + field.Fields = make([]*Schema, count) + for i := 0; i < count; i += 1 { + groupField := group.Field(i) + field.Fields[i] = buildSchema(groupField.Name(), groupField) + } } return field } diff --git a/cmd/wasm/main.go b/cmd/wasm/main.go index 2834260..25ed682 100644 --- a/cmd/wasm/main.go +++ b/cmd/wasm/main.go @@ -21,8 +21,10 @@ import ( "strings" "syscall/js" + "github.com/apache/arrow/go/v14/parquet/file" "github.com/planetlabs/gpq/internal/geojson" - "github.com/segmentio/parquet-go" + "github.com/planetlabs/gpq/internal/geoparquet" + "github.com/planetlabs/gpq/internal/pqutil" ) var uint8ArrayConstructor = js.Global().Get("Uint8Array") @@ -56,24 +58,28 @@ var fromParquet = js.FuncOf(func(this js.Value, args []js.Value) any { data := make([]byte, numBytes) js.CopyBytesToGo(data, args[0]) - input, fileErr := parquet.OpenFile(bytes.NewReader(data), int64(numBytes)) - if fileErr != nil { - return returnFromError(fileErr) - } - output := &bytes.Buffer{} - convertErr := geojson.FromParquet(input, output) + convertErr := geojson.FromParquet(bytes.NewReader(data), output) if convertErr != nil { return returnFromError(convertErr) } - metadata, _ := input.Lookup("geo") + reader, readerErr := file.NewParquetReader(bytes.NewReader(data)) + if readerErr != nil { + return returnFromError(readerErr) + } + defer reader.Close() + + metadata, metadataErr := geoparquet.GetMetadataValue(reader.MetaData().KeyValueMetadata()) + if metadataErr != nil { + return returnFromError(metadataErr) + } return 
returnFromValue(map[string]any{ "data": output.String(), "geo": metadata, - "schema": input.Schema().String(), - "records": input.NumRows(), + "schema": pqutil.ParquetSchemaString(reader.MetaData().Schema), + "records": reader.NumRows(), }) }) @@ -93,12 +99,15 @@ var toParquet = js.FuncOf(func(this js.Value, args []js.Value) any { return returnFromError(convertErr) } - file, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len())) - if err != nil { - return returnFromError(err) + reader, readerErr := file.NewParquetReader(bytes.NewReader(output.Bytes())) + if readerErr != nil { + return returnFromError(readerErr) } - metadata, _ := file.Lookup("geo") + metadata, metadataErr := geoparquet.GetMetadataValue(reader.MetaData().KeyValueMetadata()) + if metadataErr != nil { + return returnFromError(metadataErr) + } array := uint8ArrayConstructor.New(output.Len()) js.CopyBytesToJS(array, output.Bytes()) @@ -106,8 +115,8 @@ var toParquet = js.FuncOf(func(this js.Value, args []js.Value) any { return returnFromValue(map[string]any{ "data": array, "geo": metadata, - "schema": file.Schema().String(), - "records": file.NumRows(), + "schema": pqutil.ParquetSchemaString(reader.MetaData().Schema), + "records": reader.NumRows(), }) }) diff --git a/go.mod b/go.mod index a32e62f..89f4492 100644 --- a/go.mod +++ b/go.mod @@ -1,34 +1,50 @@ module github.com/planetlabs/gpq -go 1.20 +go 1.21 require ( github.com/alecthomas/kong v0.8.0 + github.com/apache/arrow/go/v14 v14.0.0-20230922164031-772a01c080ad github.com/fatih/color v1.15.0 github.com/jedib0t/go-pretty/v6 v6.4.7 github.com/paulmach/orb v0.10.0 github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 - github.com/segmentio/encoding v0.3.6 - github.com/segmentio/parquet-go v0.0.0-20230605165518-1fd7f3303070 github.com/stretchr/testify v1.8.4 golang.org/x/term v0.12.0 ) require ( - github.com/andybalholm/brotli v1.0.3 // indirect + github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect + github.com/andybalholm/brotli v1.0.5 // indirect + github.com/apache/thrift v0.17.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect - github.com/google/go-cmp v0.5.9 // indirect - github.com/google/uuid v1.3.0 // indirect - github.com/klauspost/compress v1.15.9 // indirect + github.com/goccy/go-json v0.10.2 // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/google/flatbuffers v23.5.26+incompatible // indirect + github.com/klauspost/asmfmt v1.3.2 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/klauspost/cpuid/v2 v2.2.5 // indirect github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.17 // indirect + github.com/mattn/go-isatty v0.0.19 // indirect github.com/mattn/go-runewidth v0.0.13 // indirect - github.com/olekukonko/tablewriter v0.0.5 // indirect - github.com/pierrec/lz4/v4 v4.1.9 // indirect + github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect + github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect + github.com/pierrec/lz4/v4 v4.1.18 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rivo/uniseg v0.2.0 // indirect - github.com/segmentio/asm v1.1.3 // indirect + github.com/zeebo/xxh3 v1.0.2 // indirect go.mongodb.org/mongo-driver v1.11.4 // indirect + golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect + golang.org/x/mod v0.12.0 // indirect + golang.org/x/net v0.15.0 // indirect + golang.org/x/sync v0.3.0 // indirect 
golang.org/x/sys v0.12.0 // indirect + golang.org/x/text v0.13.0 // indirect + golang.org/x/tools v0.13.0 // indirect + golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect + google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect + google.golang.org/grpc v1.54.0 // indirect + google.golang.org/protobuf v1.31.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index c3a36de..fa85225 100644 --- a/go.sum +++ b/go.sum @@ -1,17 +1,33 @@ +github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU= +github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= github.com/alecthomas/assert/v2 v2.1.0 h1:tbredtNcQnoSd3QBhQWI7QZ3XHOVkw1Moklp2ojoH/0= +github.com/alecthomas/assert/v2 v2.1.0/go.mod h1:b/+1DI2Q6NckYi+3mXyH3wFb8qG37K/DuK80n7WefXA= github.com/alecthomas/kong v0.8.0 h1:ryDCzutfIqJPnNn0omnrgHLbAggDQM2VWHikE1xqK7s= github.com/alecthomas/kong v0.8.0/go.mod h1:n1iCIO2xS46oE8ZfYCNDqdR0b0wZNrXAIAqro/2132U= github.com/alecthomas/repr v0.1.0 h1:ENn2e1+J3k09gyj2shc0dHr/yjaWSHRlrJ4DPMevDqE= -github.com/andybalholm/brotli v1.0.3 h1:fpcw+r1N1h0Poc1F/pHbW40cUm/lMEQslZtCkBQ0UnM= -github.com/andybalholm/brotli v1.0.3/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= +github.com/alecthomas/repr v0.1.0/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= +github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= +github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= +github.com/apache/arrow/go/v14 v14.0.0-20230922164031-772a01c080ad h1:SR1hiquYLx7Z3uM+t0lwcpDIVMyGpz29ZRUOBl5qz/s= +github.com/apache/arrow/go/v14 v14.0.0-20230922164031-772a01c080ad/go.mod h1:/SqmdO2dsWqFHqQQeupnsr0ollL8C91n3x0I72rArY8= +github.com/apache/thrift v0.17.0 h1:cMd2aj52n+8VoAtvSvLn4kDC3aZ6IAkBuqWQ2IDu7wo= +github.com/apache/thrift v0.17.0/go.mod h1:OLxhMRJxomX+1I/KUw03qoV3mMz16BwaKI+d4fPBx7Q= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs= github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw= +github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= +github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8ioaQmyPLg1b8VwK5WJg= +github.com/google/flatbuffers v23.5.26+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp 
v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= @@ -19,34 +35,41 @@ github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= github.com/jedib0t/go-pretty/v6 v6.4.7 h1:lwiTJr1DEkAgzljsUsORmWsVn5MQjt1BPJdPCtJ6KXE= github.com/jedib0t/go-pretty/v6 v6.4.7/go.mod h1:Ndk3ase2CkQbXLLNf5QDHoYb6J9WtVfmHZu9n8rk2xs= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= +github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY= -github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= +github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= -github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= +github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU= github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= 
+github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= -github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= -github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/paulmach/orb v0.10.0 h1:guVYVqzxHE/CQ1KpfGO077TR0ATHSNjp4s6XGLn3W9s= github.com/paulmach/orb v0.10.0/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU= github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY= -github.com/pierrec/lz4/v4 v4.1.9 h1:xkrjwpOP5xg1k4Nn4GX4a4YFGhscyQL/3EddJ1Xxqm8= -github.com/pierrec/lz4/v4 v4.1.9/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= +github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/profile v1.6.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -55,14 +78,10 @@ github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1 h1:lZUw3E0/J3roVtGQ+SCrUrg3ON6NgVqpn3+iol9aGu4= github.com/santhosh-tekuri/jsonschema/v5 v5.3.1/go.mod h1:uToXkOrWAZ6/Oc07xWQrPOhJotwFIyu2bBVN41fcDUY= -github.com/segmentio/asm v1.1.3 h1:WM03sfUOENvvKexOLp+pCqgb/WDjsi7EK8gIsICtzhc= -github.com/segmentio/asm v1.1.3/go.mod h1:Ld3L4ZXGNcSLRg4JBsZ3//1+f/TjYl0Mzen/DQy1EJg= -github.com/segmentio/encoding v0.3.6 h1:E6lVLyDPseWEulBmCmAKPanDd3jiyGDo5gMcugCRwZQ= -github.com/segmentio/encoding v0.3.6/go.mod h1:n0JeuIqEQrQoPDGsjo8UNd1iA0U8d8+oHAA4E3G3OxM= -github.com/segmentio/parquet-go v0.0.0-20230605165518-1fd7f3303070 h1:vMhjcOmrl9K8nFop55fNg/72P6rGDHv5UAnprFU/vfI= -github.com/segmentio/parquet-go v0.0.0-20230605165518-1fd7f3303070/go.mod h1:+J0xQnJjm8DuQUHBO7t57EnmPbstT6+b45+p3DC9k1Q= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= @@ -76,32 +95,45 @@ github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgk github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod 
h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= go.mongodb.org/mongo-driver v1.11.4 h1:4ayjakA013OdpGyL2K3ZqylTac/rMjrJOMZ1EHizXas= go.mongodb.org/mongo-driver v1.11.4/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9 h1:GoHiUyI/Tp2nVkLI2mCxVkOjsbSXD66ic0XW0js0R9g= +golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqRShND87VCbxmc6bL1Yd2oYrm6k= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211110154304-99a53858aa08/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -111,17 +143,31 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.13.0 h1:Iey4qkscZuv0VvIt8E0neZjtPVQFSc870HQ448QgEmQ= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= +golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= +gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o= +gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY= +google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= +google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= +google.golang.org/grpc v1.54.0 h1:EhTqbhiYeixwWQtAEZAxmV9MGqcjEU2mFx52xCzNyag= +google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= +google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/geo/geo.go b/internal/geo/geo.go new file mode 100644 index 
0000000..97e831c --- /dev/null +++ b/internal/geo/geo.go @@ -0,0 +1,316 @@ +package geo + +import ( + "encoding/json" + "fmt" + "math" + "sync" + + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" + "github.com/paulmach/orb/encoding/wkt" + orbjson "github.com/paulmach/orb/geojson" +) + +type Feature struct { + Id any `json:"id,omitempty"` + Type string `json:"type"` + Geometry orb.Geometry `json:"geometry"` + Properties map[string]any `json:"properties"` +} + +var ( + _ json.Marshaler = (*Feature)(nil) + _ json.Unmarshaler = (*Feature)(nil) +) + +func (f *Feature) MarshalJSON() ([]byte, error) { + m := map[string]any{ + "type": "Feature", + "geometry": orbjson.NewGeometry(f.Geometry), + "properties": f.Properties, + } + if f.Id != nil { + m["id"] = f.Id + } + return json.Marshal(m) +} + +type jsonFeature struct { + Id any `json:"id,omitempty"` + Type string `json:"type"` + Geometry json.RawMessage `json:"geometry"` + Properties map[string]any `json:"properties"` +} + +var rawNull = json.RawMessage([]byte("null")) + +func isRawNull(raw json.RawMessage) bool { + if len(raw) != len(rawNull) { + return false + } + for i, c := range raw { + if c != rawNull[i] { + return false + } + } + return true +} + +func (f *Feature) UnmarshalJSON(data []byte) error { + jf := &jsonFeature{} + if err := json.Unmarshal(data, jf); err != nil { + return err + } + + f.Type = jf.Type + f.Id = jf.Id + f.Properties = jf.Properties + + if isRawNull(jf.Geometry) { + return nil + } + geometry := &orbjson.Geometry{} + if err := json.Unmarshal(jf.Geometry, geometry); err != nil { + return err + } + + f.Geometry = geometry.Geometry() + return nil +} + +const ( + EncodingWKB = "WKB" + EncodingWKT = "WKT" +) + +func DecodeGeometry(value any, encoding string) (*orbjson.Geometry, error) { + if value == nil { + return nil, nil + } + if encoding == "" { + if _, ok := value.([]byte); ok { + encoding = EncodingWKB + } else if _, ok := value.(string); ok { + encoding = EncodingWKT + } + } + if encoding == EncodingWKB { + data, ok := value.([]byte) + if !ok { + return nil, fmt.Errorf("expected bytes for wkb geometry, got %T", value) + } + g, err := wkb.Unmarshal(data) + if err != nil { + return nil, err + } + return orbjson.NewGeometry(g), nil + } + if encoding == EncodingWKT { + str, ok := value.(string) + if !ok { + return nil, fmt.Errorf("expected string for wkt geometry, got %T", value) + } + g, err := wkt.Unmarshal(str) + if err != nil { + return nil, err + } + return orbjson.NewGeometry(g), nil + } + return nil, fmt.Errorf("unsupported encoding: %s", encoding) +} + +type CollectionInfo struct { + mutex *sync.RWMutex + minX float64 + maxX float64 + minY float64 + maxY float64 + types map[string]bool +} + +func NewCollectionInfo(concurrent bool) *CollectionInfo { + var mutex *sync.RWMutex + if concurrent { + mutex = &sync.RWMutex{} + } + return &CollectionInfo{ + mutex: mutex, + types: map[string]bool{}, + minX: math.MaxFloat64, + maxX: -math.MaxFloat64, + minY: math.MaxFloat64, + maxY: -math.MaxFloat64, + } +} + +func (i *CollectionInfo) writeLock() { + if i.mutex == nil { + return + } + i.mutex.Lock() +} + +func (i *CollectionInfo) writeUnlock() { + if i.mutex == nil { + return + } + i.mutex.Unlock() +} + +func (i *CollectionInfo) readLock() { + if i.mutex == nil { + return + } + i.mutex.RLock() +} + +func (i *CollectionInfo) readUnlock() { + if i.mutex == nil { + return + } + i.mutex.RUnlock() +} + +func (i *CollectionInfo) AddBounds(bounds *orb.Bound) { + i.writeLock() + minPoint := bounds.Min + minX := 
minPoint[0] + minY := minPoint[1] + maxPoint := bounds.Max + maxX := maxPoint[0] + maxY := maxPoint[1] + i.minX = math.Min(i.minX, minX) + i.maxX = math.Max(i.maxX, maxX) + i.minY = math.Min(i.minY, minY) + i.maxY = math.Max(i.maxY, maxY) + i.writeUnlock() +} + +func (i *CollectionInfo) Bounds() *orb.Bound { + i.readLock() + bounds := &orb.Bound{ + Min: orb.Point{i.minX, i.minY}, + Max: orb.Point{i.maxX, i.maxY}, + } + i.readUnlock() + return bounds +} + +func (i *CollectionInfo) AddType(typ string) { + i.writeLock() + i.types[typ] = true + i.writeUnlock() +} + +func (i *CollectionInfo) AddTypes(types []string) { + i.writeLock() + for _, typ := range types { + i.types[typ] = true + } + i.writeUnlock() +} + +func (i *CollectionInfo) Types() []string { + i.readLock() + types := []string{} + for typ, ok := range i.types { + if ok { + types = append(types, typ) + } + } + i.readUnlock() + return types +} + +type DatasetInfo struct { + mutex *sync.RWMutex + collections map[string]*CollectionInfo +} + +func NewDatasetInfo(concurrent bool) *DatasetInfo { + var mutex *sync.RWMutex + if concurrent { + mutex = &sync.RWMutex{} + } + return &DatasetInfo{ + mutex: mutex, + collections: map[string]*CollectionInfo{}, + } +} + +func (i *DatasetInfo) writeLock() { + if i.mutex == nil { + return + } + i.mutex.Lock() +} + +func (i *DatasetInfo) writeUnlock() { + if i.mutex == nil { + return + } + i.mutex.Unlock() +} + +func (i *DatasetInfo) readLock() { + if i.mutex == nil { + return + } + i.mutex.RLock() +} + +func (i *DatasetInfo) readUnlock() { + if i.mutex == nil { + return + } + i.mutex.RUnlock() +} + +func (i *DatasetInfo) NumCollections() int { + i.readLock() + num := len(i.collections) + i.readUnlock() + return num +} + +func (i *DatasetInfo) AddCollection(name string) { + i.writeLock() + i.collections[name] = NewCollectionInfo(i.mutex != nil) + i.writeUnlock() +} + +func (i *DatasetInfo) HasCollection(name string) bool { + i.readLock() + _, has := i.collections[name] + i.readUnlock() + return has +} + +func (i *DatasetInfo) AddBounds(name string, bounds *orb.Bound) { + i.readLock() + collection := i.collections[name] + i.readUnlock() + collection.AddBounds(bounds) +} + +func (i *DatasetInfo) Bounds(name string) *orb.Bound { + i.readLock() + collection := i.collections[name] + i.readUnlock() + return collection.Bounds() +} + +func (i *DatasetInfo) AddTypes(name string, types []string) { + i.readLock() + collection := i.collections[name] + i.readUnlock() + collection.AddTypes(types) +} + +func (i *DatasetInfo) Types(name string) []string { + i.readLock() + collection := i.collections[name] + i.readUnlock() + return collection.Types() +} diff --git a/internal/geojson/featurereader.go b/internal/geojson/featurereader.go new file mode 100644 index 0000000..368c2f8 --- /dev/null +++ b/internal/geojson/featurereader.go @@ -0,0 +1,273 @@ +package geojson + +import ( + "encoding/json" + "errors" + "fmt" + "io" + + "github.com/paulmach/orb" + orbjson "github.com/paulmach/orb/geojson" + "github.com/planetlabs/gpq/internal/geo" +) + +type FeatureReader struct { + collection bool + decoder *json.Decoder +} + +func NewFeatureReader(input io.Reader) *FeatureReader { + return &FeatureReader{ + decoder: json.NewDecoder(input), + } +} + +func (r *FeatureReader) Read() (*geo.Feature, error) { + if r.decoder == nil { + return nil, io.EOF + } + + if r.collection { + return r.readFeature() + } + + defer func() { + if !r.collection { + r.decoder = nil + } + }() + + token, err := r.decoder.Token() + if err == io.EOF { + 
return nil, io.EOF + } + if err != nil { + return nil, err + } + + delim, ok := token.(json.Delim) + if !ok || delim != json.Delim('{') { + return nil, fmt.Errorf("expected a JSON object, got %s", token) + } + + var parsedType string + var feature *geo.Feature + var coordinatesJSON json.RawMessage + for { + keyToken, keyErr := r.decoder.Token() + if keyErr == io.EOF { + if feature == nil { + return nil, io.EOF + } + return feature, nil + } + if keyErr != nil { + return nil, keyErr + } + + delim, ok := keyToken.(json.Delim) + if ok && delim == json.Delim('}') { + if feature == nil { + return nil, errors.New("expected a FeatureCollection, a Feature, or a Geometry object") + } + return feature, nil + } + + key, ok := keyToken.(string) + if !ok { + return nil, fmt.Errorf("unexpected token: %s", keyToken) + } + + if key == "geometry" { + if feature == nil { + feature = &geo.Feature{} + } else if feature.Geometry != nil { + return nil, errors.New("found duplicate geometry") + } + geometry := &orbjson.Geometry{} + if err := r.decoder.Decode(geometry); err != nil { + return nil, fmt.Errorf("trouble parsing geometry: %w", err) + } + feature.Geometry = geometry.Geometry() + continue + } + + if key == "properties" { + if feature == nil { + feature = &geo.Feature{} + } else if feature.Properties != nil { + return nil, errors.New("found duplicate properties") + } + properties := map[string]any{} + if err := r.decoder.Decode(&properties); err != nil { + return nil, fmt.Errorf("trouble parsing properties: %w", err) + } + feature.Properties = properties + continue + } + + if key == "coordinates" { + if feature == nil { + feature = &geo.Feature{} + } else if feature.Geometry != nil { + return nil, errors.New("found unexpected coordinates") + } + if coordinatesJSON != nil { + return nil, errors.New("found duplicate coordinates") + } + if err := r.decoder.Decode(&coordinatesJSON); err != nil { + return nil, fmt.Errorf("trouble parsing coordinates: %w", err) + } + if parsedType != "" { + return r.featureFromCoordinates(parsedType, coordinatesJSON) + } + continue + } + + valueToken, valueErr := r.decoder.Token() + if valueErr != nil { + return nil, valueErr + } + + if key == "type" { + if parsedType != "" { + return nil, errors.New("found duplicate type") + } + value, ok := valueToken.(string) + if !ok { + return nil, fmt.Errorf("unexpected type: %s", valueToken) + } + parsedType = value + if coordinatesJSON != nil { + return r.featureFromCoordinates(parsedType, coordinatesJSON) + } + continue + } + + if key == "features" { + if parsedType != "" && parsedType != "FeatureCollection" { + return nil, fmt.Errorf("found features in unexpected %q type", parsedType) + } + delim, ok := valueToken.(json.Delim) + if !ok || delim != json.Delim('[') { + return nil, fmt.Errorf("expected an array of features, got %s", valueToken) + } + r.collection = true + return r.readFeature() + } + + if key == "geometries" { + if parsedType != "" && parsedType != "GeometryCollection" { + return nil, fmt.Errorf("found geometries in unexpected %q type", parsedType) + } + delim, ok := valueToken.(json.Delim) + if !ok || delim != json.Delim('[') { + return nil, fmt.Errorf("expected an array of geometries, got %s", valueToken) + } + return r.readGeometryCollection() + } + + if key == "id" { + if feature == nil { + feature = &geo.Feature{} + } else if feature.Id != nil { + return nil, errors.New("found duplicate id") + } + _, stringId := valueToken.(string) + _, floatId := valueToken.(float64) + if !(stringId || floatId) { + return nil, fmt.Errorf("expected id to 
be a string or number, got: %v", valueToken) + } + feature.Id = valueToken + continue + } + + if delim, ok := valueToken.(json.Delim); ok { + switch delim { + case json.Delim('['): + err := r.scanToMatching('[', ']') + if err != nil { + return nil, err + } + case json.Delim('{'): + err := r.scanToMatching('{', '}') + if err != nil { + return nil, err + } + default: + return nil, fmt.Errorf("unexpected token: %s", delim) + } + } + } +} + +func (r *FeatureReader) scanToMatching(fromDelim json.Delim, toDelim json.Delim) error { + depth := 1 + for { + token, err := r.decoder.Token() + if err != nil { + return fmt.Errorf("unexpected token: %w", err) + } + delim, ok := token.(json.Delim) + if !ok { + continue + } + if delim == fromDelim { + depth += 1 + continue + } + if delim == toDelim { + depth -= 1 + if depth == 0 { + return nil + } + } + } +} + +func (r *FeatureReader) featureFromCoordinates(geometryType string, coordinatesJSON json.RawMessage) (*geo.Feature, error) { + prefix := []byte(`{"type":"` + geometryType + `","coordinates":`) + geometryData := append(prefix, coordinatesJSON...) + geometryData = append(geometryData, "}"...) + geometry := &orbjson.Geometry{} + if err := json.Unmarshal(geometryData, geometry); err != nil { + return nil, fmt.Errorf("trouble parsing geometry coordinates: %w", err) + } + feature := &geo.Feature{ + Geometry: geometry.Geometry(), + Properties: map[string]any{}, + } + return feature, nil +} + +func (r *FeatureReader) readFeature() (*geo.Feature, error) { + if !r.decoder.More() { + r.decoder = nil + return nil, io.EOF + } + feature := &geo.Feature{} + if err := r.decoder.Decode(feature); err != nil { + return nil, err + } + return feature, nil +} + +func (r *FeatureReader) readGeometryCollection() (*geo.Feature, error) { + feature := &geo.Feature{Properties: map[string]any{}} + + if !r.decoder.More() { + return feature, nil + } + + geometries := []orb.Geometry{} + for r.decoder.More() { + geometry := &orbjson.Geometry{} + if err := r.decoder.Decode(geometry); err != nil { + return nil, fmt.Errorf("trouble parsing geometry: %w", err) + } + geometries = append(geometries, geometry.Geometry()) + } + + feature.Geometry = orb.Collection(geometries) + return feature, nil +} diff --git a/internal/geojson/featurereader_test.go b/internal/geojson/featurereader_test.go new file mode 100644 index 0000000..ee02c4f --- /dev/null +++ b/internal/geojson/featurereader_test.go @@ -0,0 +1,141 @@ +package geojson_test + +import ( + "io" + "os" + "testing" + + "github.com/paulmach/orb" + "github.com/planetlabs/gpq/internal/geo" + "github.com/planetlabs/gpq/internal/geojson" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestFeatureReader(t *testing.T) { + file, openErr := os.Open("testdata/example.geojson") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + features := []*geo.Feature{} + for { + feature, err := reader.Read() + if err == io.EOF { + break + } + require.NoError(t, err) + features = append(features, feature) + } + require.Len(t, features, 5) + + fiji := features[0] + assert.NotNil(t, fiji.Geometry) + assert.Equal(t, "Oceania", fiji.Properties["continent"]) + assert.Equal(t, float64(920938), fiji.Properties["pop_est"]) + + usa := features[4] + assert.NotNil(t, usa.Geometry) + assert.Equal(t, "North America", usa.Properties["continent"]) + assert.Equal(t, float64(326625791), usa.Properties["pop_est"]) +} + +func TestFeatureReaderPointGeometry(t *testing.T) { + file, openErr := 
os.Open("testdata/point-geometry.geojson") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + features := []*geo.Feature{} + for { + feature, err := reader.Read() + if err == io.EOF { + break + } + require.NoError(t, err) + features = append(features, feature) + } + require.Len(t, features, 1) + + feature := features[0] + require.NotNil(t, feature.Geometry) + assert.Equal(t, "Point", feature.Geometry.GeoJSONType()) + point, ok := feature.Geometry.(orb.Point) + require.True(t, ok) + assert.True(t, point.Equal(orb.Point{1, 2})) + assert.Len(t, feature.Properties, 0) +} + +func TestFeatureReaderSingleFeature(t *testing.T) { + file, openErr := os.Open("testdata/feature.geojson") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + features := []*geo.Feature{} + for { + feature, err := reader.Read() + if err == io.EOF { + break + } + require.NoError(t, err) + features = append(features, feature) + } + require.Len(t, features, 1) + + feature := features[0] + require.NotNil(t, feature.Geometry) + assert.Equal(t, "Point", feature.Geometry.GeoJSONType()) + point, ok := feature.Geometry.(orb.Point) + require.True(t, ok) + assert.True(t, point.Equal(orb.Point{1, 2})) + assert.Equal(t, map[string]any{"name": "test"}, feature.Properties) +} + +func TestFeatureReaderEmptyFeatureCollection(t *testing.T) { + file, openErr := os.Open("testdata/empty-collection.geojson") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + feature, err := reader.Read() + assert.Nil(t, feature) + assert.Equal(t, io.EOF, err) +} + +func TestFeatureReaderBadCollection(t *testing.T) { + file, openErr := os.Open("testdata/bad-collection.geojson") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + feature, noErr := reader.Read() + assert.NotNil(t, feature) + assert.NoError(t, noErr) + + noFeature, err := reader.Read() + require.Nil(t, noFeature) + require.EqualError(t, err, "geojson: invalid geometry") +} + +func TestFeatureReaderNotGeoJSON(t *testing.T) { + file, openErr := os.Open("testdata/not-geojson.json") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + feature, err := reader.Read() + assert.Nil(t, feature) + assert.EqualError(t, err, "expected a FeatureCollection, a Feature, or a Geometry object") +} + +func TestFeatureReaderNotGeoJSONArray(t *testing.T) { + file, openErr := os.Open("testdata/array.json") + require.NoError(t, openErr) + + reader := geojson.NewFeatureReader(file) + + feature, err := reader.Read() + assert.Nil(t, feature) + assert.EqualError(t, err, "expected a JSON object, got [") +} diff --git a/internal/geojson/geojson.go b/internal/geojson/geojson.go index 9ae8b8b..bfeb93a 100644 --- a/internal/geojson/geojson.go +++ b/internal/geojson/geojson.go @@ -1,512 +1,60 @@ -// Copyright 2023 Planet Labs PBC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- package geojson import ( - "encoding/json" - "errors" "fmt" "io" - "reflect" - "github.com/paulmach/orb" - orbjson "github.com/paulmach/orb/geojson" + "github.com/apache/arrow/go/v14/parquet" + "github.com/planetlabs/gpq/internal/geo" "github.com/planetlabs/gpq/internal/geoparquet" - "github.com/segmentio/parquet-go" + "github.com/planetlabs/gpq/internal/pqutil" ) -type FeatureWriter struct { - writer io.Writer - schema *parquet.Schema - metadata *geoparquet.Metadata - writing bool -} - -func NewFeatureWriter(writer io.Writer, metadata *geoparquet.Metadata, schema *parquet.Schema) (*FeatureWriter, error) { - featureWriter := &FeatureWriter{ - writer: writer, - schema: schema, - metadata: metadata, - } - return featureWriter, nil -} - -func toFeature(row parquet.Row, schema *parquet.Schema, metadata *geoparquet.Metadata) (*Feature, error) { - properties := map[string]any{} - if err := schema.Reconstruct(&properties, row); err != nil { - return nil, err - } - - var primaryGeometry orb.Geometry - - for name := range metadata.Columns { - value, ok := properties[name] - if !ok { - return nil, fmt.Errorf("missing geometry column: %s", name) - } - - if name == metadata.PrimaryColumn { - delete(properties, name) - } - if value == nil { - continue - } - - geometry, _, err := geoparquet.Geometry(value, name, metadata, schema) - if err != nil { - return nil, err - } - - if name == metadata.PrimaryColumn { - primaryGeometry = geometry - } else { - properties[name] = geometry - } - } - - feature := &Feature{ - Properties: properties, - Geometry: primaryGeometry, - } - return feature, nil -} - -func (w *FeatureWriter) Write(row parquet.Row) error { - if !w.writing { - _, err := io.WriteString(w.writer, `{"type":"FeatureCollection","features":[`) - if err != nil { - return err - } - w.writing = true - } else { - _, err := io.WriteString(w.writer, ",") - if err != nil { - return err - } - } - - feature, err := toFeature(row, w.schema, w.metadata) - if err != nil { - return err - } - - encoder := json.NewEncoder(w.writer) - return encoder.Encode(feature) -} +const primaryColumn = "geometry" -func (w *FeatureWriter) Close() error { - if w.writing { - _, err := io.WriteString(w.writer, "]}") - if err != nil { - return err - } - w.writing = false - } else { - _, err := io.WriteString(w.writer, `{"type":"FeatureCollection","features":[]}`) - if err != nil { - return err - } - } - return nil -} - -type Feature struct { - Id any `json:"id,omitempty"` - Type string `json:"type"` - Geometry orb.Geometry `json:"geometry"` - Properties map[string]any `json:"properties"` -} - -func (f *Feature) MarshalJSON() ([]byte, error) { - m := map[string]any{ - "type": "Feature", - "geometry": orbjson.NewGeometry(f.Geometry), - "properties": f.Properties, - } - if f.Id != nil { - m["id"] = f.Id - } - return json.Marshal(m) -} - -type jsonFeature struct { - Id any `json:"id,omitempty"` - Type string `json:"type"` - Geometry json.RawMessage `json:"geometry"` - Properties map[string]any `json:"properties"` -} - -var rawNull = json.RawMessage{'n', 'u', 'l', 'l'} - -func isRawNull(raw json.RawMessage) bool { - if len(raw) != len(rawNull) { - return false - } - for i, c := range raw { - if c != rawNull[i] { - return false - } +func GetDefaultMetadata() *geoparquet.Metadata { + return &geoparquet.Metadata{ + Version: geoparquet.Version, + PrimaryColumn: primaryColumn, + Columns: map[string]*geoparquet.GeometryColumn{ + primaryColumn: { + Encoding: "WKB", + GeometryTypes: []string{}, + }, + }, } - return true } -func (f *Feature) 
UnmarshalJSON(data []byte) error { - jf := &jsonFeature{} - if err := json.Unmarshal(data, jf); err != nil { - return err +func FromParquet(reader parquet.ReaderAtSeeker, writer io.Writer) error { + recordReader, rrErr := geoparquet.NewRecordReader(&geoparquet.ReaderConfig{ + Reader: reader, + }) + if rrErr != nil { + return rrErr } + defer recordReader.Close() - f.Type = jf.Type - f.Id = jf.Id - f.Properties = jf.Properties - - if isRawNull(jf.Geometry) { - return nil - } - geometry := &orbjson.Geometry{} - if err := json.Unmarshal(jf.Geometry, geometry); err != nil { - return err - } - - f.Geometry = geometry.Geometry() - return nil -} - -func FromParquet(file *parquet.File, writer io.Writer) error { - rowReader := geoparquet.NewRowReader(file) - - metadata, geoErr := geoparquet.GetMetadata(file) - if geoErr != nil { - return geoErr - } + geoMetadata := recordReader.Metadata() - featureWriter, writerErr := NewFeatureWriter(writer, metadata, file.Schema()) - if writerErr != nil { - return writerErr + jsonWriter, jsonErr := NewRecordWriter(writer, geoMetadata) + if jsonErr != nil { + return jsonErr } for { - row, readErr := rowReader.Next() + record, readErr := recordReader.Read() if readErr == io.EOF { break } if readErr != nil { return readErr } - - writeErr := featureWriter.Write(row) - if writeErr != nil { - return writeErr - } - } - - return featureWriter.Close() -} - -type FeatureReader struct { - buffer []*Feature - collection bool - decoder *json.Decoder -} - -func NewFeatureReader(input io.Reader) *FeatureReader { - return &FeatureReader{ - decoder: json.NewDecoder(input), - } -} - -func (reader *FeatureReader) Converter(min int, max int) (*TypeConverter, error) { - features := []*Feature{} - schemaBuilder := &SchemaBuilder{} - for attempts := 0; attempts < max-1; attempts += 1 { - feature, readErr := reader.Next() - if readErr == io.EOF { - if attempts == 0 { - return nil, errors.New("empty feature collection") - } - reader.buffer = features - return schemaBuilder.Converter() - } - if readErr != nil { - return nil, readErr - } - features = append(features, feature) - - if complete := schemaBuilder.Add(feature); complete && attempts >= min-1 { - reader.buffer = features - return schemaBuilder.Converter() - } - } - return nil, fmt.Errorf("failed to generate converter from first %d features", max) -} - -func (r *FeatureReader) Next() (*Feature, error) { - if len(r.buffer) > 0 { - feature := r.buffer[0] - r.buffer = r.buffer[1:] - return feature, nil - } - - if r.decoder == nil { - return nil, io.EOF - } - - if r.collection { - return r.readFeature() - } - - defer func() { - if !r.collection { - r.decoder = nil - } - }() - - token, err := r.decoder.Token() - if err == io.EOF { - return nil, io.EOF - } - if err != nil { - return nil, err - } - - delim, ok := token.(json.Delim) - if !ok || delim != json.Delim('{') { - return nil, fmt.Errorf("expected a JSON object, got %s", token) - } - - var parsedType string - var feature *Feature - var coordinatesJSON json.RawMessage - for { - keyToken, keyErr := r.decoder.Token() - if keyErr == io.EOF { - if feature == nil { - return nil, io.EOF - } - return feature, nil - } - if keyErr != nil { - return nil, keyErr - } - - delim, ok := keyToken.(json.Delim) - if ok && delim == json.Delim('}') { - if feature == nil { - return nil, errors.New("expected a FeatureCollection, a Feature, or a Geometry object") - } - return feature, nil - } - - key, ok := keyToken.(string) - if !ok { - return nil, fmt.Errorf("unexpected token: %s", token) - } - - if key 
== "geometry" { - if feature == nil { - feature = &Feature{} - } else if feature.Geometry != nil { - return nil, errors.New("found duplicate geometry") - } - geometry := &orbjson.Geometry{} - if err := r.decoder.Decode(geometry); err != nil { - return nil, fmt.Errorf("trouble parsing geometry: %w", err) - } - feature.Geometry = geometry.Geometry() - continue - } - - if key == "properties" { - if feature == nil { - feature = &Feature{} - } else if feature.Properties != nil { - return nil, errors.New("found duplicate properties") - } - properties := map[string]any{} - if err := r.decoder.Decode(&properties); err != nil { - return nil, fmt.Errorf("trouble parsing properties: %w", err) - } - feature.Properties = properties - continue - } - - if key == "coordinates" { - if feature == nil { - feature = &Feature{} - } else if feature.Geometry != nil { - return nil, errors.New("found unexpected coordinates") - } - if coordinatesJSON != nil { - return nil, errors.New("found duplicate coordinates") - } - if err := r.decoder.Decode(&coordinatesJSON); err != nil { - return nil, fmt.Errorf("trouble parsing coordinates") - } - if parsedType != "" { - return r.featureFromCoordinates(parsedType, coordinatesJSON) - } - continue - } - - valueToken, valueErr := r.decoder.Token() - if valueErr != nil { - return nil, valueErr - } - - if key == "type" { - if parsedType != "" { - return nil, errors.New("found duplicate type") - } - value, ok := valueToken.(string) - if !ok { - return nil, fmt.Errorf("unexpected type: %s", valueToken) - } - parsedType = value - if coordinatesJSON != nil { - return r.featureFromCoordinates(parsedType, coordinatesJSON) - } - continue - } - - if key == "features" { - if parsedType != "" && parsedType != "FeatureCollection" { - return nil, fmt.Errorf("found features in unexpected %q type", parsedType) - } - delim, ok := valueToken.(json.Delim) - if !ok || delim != json.Delim('[') { - return nil, fmt.Errorf("expected an array of features, got %s", token) - } - r.collection = true - return r.readFeature() - } - - if key == "geometries" { - if parsedType != "" && parsedType != "GeometryCollection" { - return nil, fmt.Errorf("found geometries in unexpected %q type", parsedType) - } - delim, ok := valueToken.(json.Delim) - if !ok || delim != json.Delim('[') { - return nil, fmt.Errorf("expected an array of geometries, got %s", token) - } - return r.readGeometryCollection() - } - - if key == "id" { - if feature == nil { - feature = &Feature{} - } else if feature.Id != nil { - return nil, errors.New("found duplicate id") - } - _, stringId := valueToken.(string) - _, floatId := valueToken.(float64) - if !(stringId || floatId) { - return nil, fmt.Errorf("expected id to be a string or number, got: %v", valueToken) - } - feature.Id = valueToken - continue - } - - if delim, ok := valueToken.(json.Delim); ok { - switch delim { - case json.Delim('['): - err := r.scanToMatching('[', ']') - if err != nil { - return nil, err - } - case json.Delim('{'): - err := r.scanToMatching('{', '}') - if err != nil { - return nil, err - } - default: - return nil, fmt.Errorf("unexpected token: %s", delim) - } - } - - } -} - -func (r *FeatureReader) scanToMatching(fromDelim json.Delim, toDelim json.Delim) error { - depth := 1 - for { - token, err := r.decoder.Token() - if err != nil { - return fmt.Errorf("unexpected token: %w", err) - } - delim, ok := token.(json.Delim) - if !ok { - continue - } - if delim == fromDelim { - depth += 1 - continue - } - if delim == toDelim { - depth -= 1 - if depth == 0 { - return nil 
- } - } - } -} - -func (r *FeatureReader) featureFromCoordinates(geometryType string, coordinatesJSON json.RawMessage) (*Feature, error) { - prefix := []byte(`{"type":"` + geometryType + `","coordinates":`) - geometryData := append(prefix, coordinatesJSON...) - geometryData = append(geometryData, "}"...) - geometry := &orbjson.Geometry{} - if err := json.Unmarshal(geometryData, geometry); err != nil { - return nil, fmt.Errorf("trouble parsing geometry coordinates: %w", err) - } - feature := &Feature{ - Geometry: geometry.Geometry(), - Properties: map[string]any{}, - } - return feature, nil -} - -func (r *FeatureReader) readFeature() (*Feature, error) { - if !r.decoder.More() { - r.decoder = nil - return nil, io.EOF - } - feature := &Feature{} - if err := r.decoder.Decode(feature); err != nil { - return nil, err - } - return feature, nil -} - -func (r *FeatureReader) readGeometryCollection() (*Feature, error) { - feature := &Feature{Properties: map[string]any{}} - - if !r.decoder.More() { - return feature, nil - } - - geometries := []orb.Geometry{} - for r.decoder.More() { - geometry := &orbjson.Geometry{} - if err := r.decoder.Decode(geometry); err != nil { - return nil, fmt.Errorf("trouble parsing geometry: %w", err) + if err := jsonWriter.Write(record); err != nil { + return err } - geometries = append(geometries, geometry.Geometry()) } - feature.Geometry = orb.Collection(geometries) - return feature, nil + return jsonWriter.Close() } type ConvertOptions struct { @@ -519,112 +67,99 @@ type ConvertOptions struct { var defaultOptions = &ConvertOptions{ MinFeatures: 1, MaxFeatures: 50, - Compression: "gzip", + Compression: "zstd", } func ToParquet(input io.Reader, output io.Writer, convertOptions *ConvertOptions) error { - reader := NewFeatureReader(input) - if convertOptions == nil { convertOptions = defaultOptions } + reader := NewFeatureReader(input) + buffer := []*geo.Feature{} + builder := pqutil.NewArrowSchemaBuilder() + featuresRead := 0 - minFeatures := convertOptions.MinFeatures - if minFeatures == 0 { - minFeatures = defaultOptions.MinFeatures - } - - maxFeatures := convertOptions.MaxFeatures - if maxFeatures == 0 { - maxFeatures = defaultOptions.MaxFeatures - } - - converter, converterErr := reader.Converter(minFeatures, maxFeatures) - if converterErr != nil { - return converterErr - } - - schema := parquet.SchemaOf(reflect.New(converter.Type).Elem().Interface()) - - compression := convertOptions.Compression - if compression == "" { - compression = defaultOptions.Compression + var pqWriterProps *parquet.WriterProperties + if convertOptions.Compression != "" { + compression, err := pqutil.GetCompression(convertOptions.Compression) + if err != nil { + return err + } + pqWriterProps = parquet.NewWriterProperties(parquet.WithCompression(compression)) } - codec, codecErr := geoparquet.GetCodec(compression) - if codecErr != nil { - return codecErr - } - options := []parquet.WriterOption{ - parquet.Compression(codec), - schema, - } + var featureWriter *geoparquet.FeatureWriter + writeBuffered := func() error { + if !builder.Ready() { + return fmt.Errorf("failed to create schema after reading %d features", len(buffer)) + } + if err := builder.AddGeometry(geoparquet.DefaultGeometryColumn, geoparquet.DefaultGeometryEncoding); err != nil { + return err + } + sc, scErr := builder.Schema() + if scErr != nil { + return scErr + } + fw, fwErr := geoparquet.NewFeatureWriter(&geoparquet.WriterConfig{ + Writer: output, + ArrowSchema: sc, + ParquetWriterProps: pqWriterProps, + }) + if fwErr != nil { 
+ return fwErr + } - writerConfig, configErr := parquet.NewWriterConfig(options...) - if configErr != nil { - return configErr + for _, buffered := range buffer { + if err := fw.Write(buffered); err != nil { + return err + } + } + featureWriter = fw + return nil } - writer := parquet.NewGenericWriter[any](output, writerConfig) - - var bounds *orb.Bound - geometryTypeLookup := map[string]bool{} - - metadataString := convertOptions.Metadata - for { - feature, err := reader.Next() + feature, err := reader.Read() if err == io.EOF { break } if err != nil { return err } - - if metadataString == "" && feature.Geometry != nil { - b := feature.Geometry.Bound() - if bounds == nil { - bounds = &b - } else { - b = b.Union(*bounds) - bounds = &b + featuresRead += 1 + if featureWriter == nil { + if err := builder.Add(feature.Properties); err != nil { + return err } - geometryTypeLookup[feature.Geometry.GeoJSONType()] = true - } - row, err := converter.Convert(feature) - if err != nil { - return err - } - _, writeErr := writer.Write([]any{row}) - if writeErr != nil { - return writeErr - } - } + if !builder.Ready() { + buffer = append(buffer, feature) + if len(buffer) > convertOptions.MaxFeatures { + return fmt.Errorf("failed to create parquet schema after reading %d features", convertOptions.MaxFeatures) + } + continue + } - if metadataString == "" { - metadata := GetDefaultMetadata() - if bounds != nil { - metadata.Columns[metadata.PrimaryColumn].Bounds = []float64{ - bounds.Left(), bounds.Bottom(), bounds.Right(), bounds.Top(), + if len(buffer) < convertOptions.MinFeatures-1 { + buffer = append(buffer, feature) + continue } - } - geometryTypes := []string{} - if len(geometryTypeLookup) > 0 { - for geometryType := range geometryTypeLookup { - geometryTypes = append(geometryTypes, geometryType) + if err := writeBuffered(); err != nil { + return err } } - metadata.Columns[metadata.PrimaryColumn].GeometryTypes = geometryTypes - - metadataBytes, jsonErr := json.Marshal(metadata) - if jsonErr != nil { - return fmt.Errorf("failed to serialize geo metadata: %w", jsonErr) + if err := featureWriter.Write(feature); err != nil { + return err } - metadataString = string(metadataBytes) } - - writer.SetKeyValueMetadata(geoparquet.MetadataKey, metadataString) - return writer.Close() + if featuresRead > 0 { + if featureWriter == nil { + if err := writeBuffered(); err != nil { + return err + } + } + return featureWriter.Close() + } + return nil } diff --git a/internal/geojson/geojson_test.go b/internal/geojson/geojson_test.go index 2ce0503..cc4fc82 100644 --- a/internal/geojson/geojson_test.go +++ b/internal/geojson/geojson_test.go @@ -16,16 +16,24 @@ package geojson_test import ( "bytes" - "fmt" - "io" + "encoding/json" "os" + "strings" "testing" + "github.com/apache/arrow/go/v14/arrow/array" + "github.com/apache/arrow/go/v14/arrow/memory" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/compress" + "github.com/apache/arrow/go/v14/parquet/file" + "github.com/apache/arrow/go/v14/parquet/pqarrow" + "github.com/apache/arrow/go/v14/parquet/schema" "github.com/paulmach/orb" "github.com/paulmach/orb/encoding/wkb" + "github.com/planetlabs/gpq/internal/geo" "github.com/planetlabs/gpq/internal/geojson" "github.com/planetlabs/gpq/internal/geoparquet" - "github.com/segmentio/parquet-go" + "github.com/planetlabs/gpq/internal/pqutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -35,14 +43,8 @@ func TestFromParquetv040(t *testing.T) { reader, openErr := 
os.Open(input) require.NoError(t, openErr) - info, statErr := os.Stat(input) - require.NoError(t, statErr) - - file, fileErr := parquet.OpenFile(reader, info.Size()) - require.NoError(t, fileErr) - buffer := &bytes.Buffer{} - convertErr := geojson.FromParquet(file, buffer) + convertErr := geojson.FromParquet(reader, buffer) assert.NoError(t, convertErr) expected, err := os.ReadFile("testdata/example.geojson") @@ -56,14 +58,8 @@ func TestFromParquetv100Beta1(t *testing.T) { reader, openErr := os.Open(input) require.NoError(t, openErr) - info, statErr := os.Stat(input) - require.NoError(t, statErr) - - file, fileErr := parquet.OpenFile(reader, info.Size()) - require.NoError(t, fileErr) - buffer := &bytes.Buffer{} - convertErr := geojson.FromParquet(file, buffer) + convertErr := geojson.FromParquet(reader, buffer) assert.NoError(t, convertErr) expected, err := os.ReadFile("testdata/example.geojson") @@ -72,134 +68,6 @@ func TestFromParquetv100Beta1(t *testing.T) { assert.JSONEq(t, string(expected), buffer.String()) } -func TestFeatureReader(t *testing.T) { - file, openErr := os.Open("testdata/example.geojson") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - features := []*geojson.Feature{} - for { - feature, err := reader.Next() - if err == io.EOF { - break - } - require.NoError(t, err) - features = append(features, feature) - } - require.Len(t, features, 5) - - fiji := features[0] - assert.NotNil(t, fiji.Geometry) - assert.Equal(t, "Oceania", fiji.Properties["continent"]) - assert.Equal(t, float64(920938), fiji.Properties["pop_est"]) - - usa := features[4] - assert.NotNil(t, usa.Geometry) - assert.Equal(t, "North America", usa.Properties["continent"]) - assert.Equal(t, float64(326625791), usa.Properties["pop_est"]) -} - -func TestFeatureReaderPointGeometry(t *testing.T) { - file, openErr := os.Open("testdata/point-geometry.geojson") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - features := []*geojson.Feature{} - for { - feature, err := reader.Next() - if err == io.EOF { - break - } - require.NoError(t, err) - features = append(features, feature) - } - require.Len(t, features, 1) - - feature := features[0] - require.NotNil(t, feature.Geometry) - assert.Equal(t, "Point", feature.Geometry.GeoJSONType()) - point, ok := feature.Geometry.(orb.Point) - require.True(t, ok) - assert.True(t, point.Equal(orb.Point{1, 2})) - assert.Len(t, feature.Properties, 0) -} - -func TestFeatureReaderSingleFeature(t *testing.T) { - file, openErr := os.Open("testdata/feature.geojson") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - features := []*geojson.Feature{} - for { - feature, err := reader.Next() - if err == io.EOF { - break - } - require.NoError(t, err) - features = append(features, feature) - } - require.Len(t, features, 1) - - feature := features[0] - require.NotNil(t, feature.Geometry) - assert.Equal(t, "Point", feature.Geometry.GeoJSONType()) - point, ok := feature.Geometry.(orb.Point) - require.True(t, ok) - assert.True(t, point.Equal(orb.Point{1, 2})) - assert.Equal(t, map[string]any{"name": "test"}, feature.Properties) -} - -func TestFeatureReaderEmptyFeatureCollection(t *testing.T) { - file, openErr := os.Open("testdata/empty-collection.geojson") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - feature, err := reader.Next() - assert.Nil(t, feature) - assert.Equal(t, io.EOF, err) -} - -func TestFeatureReaderBadCollection(t *testing.T) { - file, openErr := 
os.Open("testdata/bad-collection.geojson") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - feature, noErr := reader.Next() - assert.NotNil(t, feature) - assert.NoError(t, noErr) - - noFeature, err := reader.Next() - require.Nil(t, noFeature) - require.EqualError(t, err, "geojson: invalid geometry") -} - -func TestFeatureReaderNotGeoJSON(t *testing.T) { - file, openErr := os.Open("testdata/not-geojson.json") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - feature, err := reader.Next() - assert.Nil(t, feature) - assert.EqualError(t, err, "expected a FeatureCollection, a Feature, or a Geometry object") -} - -func TestFeatureReaderNotGeoJSONArray(t *testing.T) { - file, openErr := os.Open("testdata/array.json") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(file) - - feature, err := reader.Next() - assert.Nil(t, feature) - assert.EqualError(t, err, "expected a JSON object, got [") -} - func TestToParquet(t *testing.T) { geojsonFile, openErr := os.Open("testdata/example.geojson") require.NoError(t, openErr) @@ -209,10 +77,10 @@ func TestToParquet(t *testing.T) { assert.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) geometryTypes := metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes() @@ -225,10 +93,10 @@ func TestToParquet(t *testing.T) { gotBounds := metadata.Columns[metadata.PrimaryColumn].Bounds assert.Equal(t, []float64{-180, -18.28799, 180, 83.23324000000001}, gotBounds) - assert.Equal(t, int64(5), parquetFile.NumRows()) + assert.Equal(t, int64(5), fileReader.NumRows()) geojsonBuffer := &bytes.Buffer{} - fromParquetErr := geojson.FromParquet(parquetFile, geojsonBuffer) + fromParquetErr := geojson.FromParquet(parquetInput, geojsonBuffer) require.NoError(t, fromParquetErr) expected, err := os.ReadFile("testdata/example.geojson") @@ -243,7 +111,7 @@ func TestToParquetMismatchedTypes(t *testing.T) { parquetBuffer := &bytes.Buffer{} toParquetErr := geojson.ToParquet(geojsonFile, parquetBuffer, nil) - assert.EqualError(t, toParquetErr, "mixed types for \"stringProperty\", expected string, but got float64") + assert.EqualError(t, toParquetErr, "expected \"stringProperty\" to be a string, got 42") } func TestToParquetRepeatedProps(t *testing.T) { @@ -255,20 +123,23 @@ func TestToParquetRepeatedProps(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - schema := parquetFile.Schema() + sc := fileReader.MetaData().Schema - numbers, ok := schema.Lookup("numbers") + numbers, ok := pqutil.LookupListElementNode(sc, "numbers") require.True(t, ok) - assert.True(t, numbers.Node.Repeated()) - assert.Equal(t, parquet.DoubleType, numbers.Node.Type()) - strings, ok := schema.Lookup("strings") + assert.Equal(t, parquet.Repetitions.Optional, numbers.RepetitionType()) + assert.Equal(t, parquet.Types.Double, numbers.PhysicalType()) + + strings, ok := pqutil.LookupListElementNode(sc, 
"strings") require.True(t, ok) - assert.True(t, strings.Node.Repeated()) - assert.Equal(t, parquet.String().Type(), strings.Node.Type()) + + assert.Equal(t, parquet.Repetitions.Optional, strings.RepetitionType()) + assert.Equal(t, parquet.Types.ByteArray, strings.PhysicalType()) + assert.Equal(t, schema.StringLogicalType{}, strings.LogicalType()) } func TestToParquetNullGeometry(t *testing.T) { @@ -280,20 +151,20 @@ func TestToParquetNullGeometry(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - schema := parquetFile.Schema() + sc := fileReader.MetaData().Schema - place, ok := schema.Lookup("place") + place, ok := pqutil.LookupPrimitiveNode(sc, "place") require.True(t, ok) - assert.True(t, place.Node.Optional()) - assert.Equal(t, parquet.String().Type(), place.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, place.RepetitionType()) + assert.Equal(t, schema.StringLogicalType{}, place.LogicalType()) - geometry, ok := schema.Lookup("geometry") + geometry, ok := pqutil.LookupPrimitiveNode(sc, "geometry") require.True(t, ok) - assert.True(t, geometry.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geometry.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, geometry.RepetitionType()) + assert.Equal(t, parquet.Types.ByteArray, geometry.PhysicalType()) } func TestToParquetAllNullGeometry(t *testing.T) { @@ -305,30 +176,30 @@ func TestToParquetAllNullGeometry(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) assert.Len(t, metadata.Columns[metadata.PrimaryColumn].GeometryTypes, 0) assert.Nil(t, metadata.Columns[metadata.PrimaryColumn].GeometryType) assert.Len(t, metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes(), 0) - schema := parquetFile.Schema() + sc := fileReader.MetaData().Schema - place, ok := schema.Lookup("place") + place, ok := pqutil.LookupPrimitiveNode(sc, "place") require.True(t, ok) - assert.True(t, place.Node.Optional()) - assert.Equal(t, parquet.String().Type(), place.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, place.RepetitionType()) + assert.Equal(t, schema.StringLogicalType{}, place.LogicalType()) - geometry, ok := schema.Lookup("geometry") + geometry, ok := pqutil.LookupPrimitiveNode(sc, "geometry") require.True(t, ok) - assert.True(t, geometry.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geometry.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, geometry.RepetitionType()) + assert.Equal(t, parquet.Types.ByteArray, geometry.PhysicalType()) } -func TestToParqueStringId(t *testing.T) { +func TestToParquetStringId(t *testing.T) { geojsonFile, openErr := os.Open("testdata/string-id.geojson") require.NoError(t, openErr) @@ -337,17 +208,17 @@ func TestToParqueStringId(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, 
parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) geometryTypes := metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes() assert.Equal(t, []string{"Point"}, geometryTypes) } -func TestToParqueNumberId(t *testing.T) { +func TestToParquetNumberId(t *testing.T) { geojsonFile, openErr := os.Open("testdata/number-id.geojson") require.NoError(t, openErr) @@ -356,17 +227,17 @@ func TestToParqueNumberId(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) geometryTypes := metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes() assert.Equal(t, []string{"Point"}, geometryTypes) } -func TestToParqueBooleanId(t *testing.T) { +func TestToParquetBooleanId(t *testing.T) { geojsonFile, openErr := os.Open("testdata/boolean-id.geojson") require.NoError(t, openErr) @@ -375,7 +246,7 @@ func TestToParqueBooleanId(t *testing.T) { assert.ErrorContains(t, toParquetErr, "expected id to be a string or number, got: true") } -func TestToParqueArrayId(t *testing.T) { +func TestToParquetArrayId(t *testing.T) { geojsonFile, openErr := os.Open("testdata/array-id.geojson") require.NoError(t, openErr) @@ -384,7 +255,7 @@ func TestToParqueArrayId(t *testing.T) { assert.ErrorContains(t, toParquetErr, "expected id to be a string or number, got: [") } -func TestToParqueObjectId(t *testing.T) { +func TestToParquetObjectId(t *testing.T) { geojsonFile, openErr := os.Open("testdata/object-id.geojson") require.NoError(t, openErr) @@ -402,10 +273,10 @@ func TestToParquetWithCRS(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) geometryTypes := metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes() @@ -421,26 +292,26 @@ func TestToParquetExtraArray(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) geometryTypes := metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes() assert.Equal(t, []string{"Point"}, geometryTypes) - schema := parquetFile.Schema() + sc := fileReader.MetaData().Schema - place, ok := schema.Lookup("name") + place, ok := pqutil.LookupPrimitiveNode(sc, "name") require.True(t, ok) - 
assert.True(t, place.Node.Optional()) - assert.Equal(t, parquet.String().Type(), place.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, place.RepetitionType()) + assert.Equal(t, schema.StringLogicalType{}, place.LogicalType()) - geometry, ok := schema.Lookup("geometry") + geometry, ok := pqutil.LookupPrimitiveNode(sc, "geometry") require.True(t, ok) - assert.True(t, geometry.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geometry.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, geometry.RepetitionType()) + assert.Equal(t, parquet.Types.ByteArray, geometry.PhysicalType()) } func TestToParquetExtraObject(t *testing.T) { @@ -452,26 +323,26 @@ func TestToParquetExtraObject(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - metadata, geoErr := geoparquet.GetMetadata(parquetFile) + metadata, geoErr := geoparquet.GetMetadata(fileReader.MetaData().KeyValueMetadata()) require.NoError(t, geoErr) geometryTypes := metadata.Columns[metadata.PrimaryColumn].GetGeometryTypes() assert.Equal(t, []string{"Point"}, geometryTypes) - schema := parquetFile.Schema() + sc := fileReader.MetaData().Schema - place, ok := schema.Lookup("name") + place, ok := pqutil.LookupPrimitiveNode(sc, "name") require.True(t, ok) - assert.True(t, place.Node.Optional()) - assert.Equal(t, parquet.String().Type(), place.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, place.RepetitionType()) + assert.Equal(t, schema.StringLogicalType{}, place.LogicalType()) - geometry, ok := schema.Lookup("geometry") + geometry, ok := pqutil.LookupPrimitiveNode(sc, "geometry") require.True(t, ok) - assert.True(t, geometry.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geometry.Node.Type()) + assert.Equal(t, parquet.Repetitions.Optional, geometry.RepetitionType()) + assert.Equal(t, parquet.Types.ByteArray, geometry.PhysicalType()) } func TestRoundTripRepeatedProps(t *testing.T) { @@ -485,11 +356,9 @@ func TestRoundTripRepeatedProps(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) jsonBuffer := &bytes.Buffer{} - convertErr := geojson.FromParquet(parquetFile, jsonBuffer) + convertErr := geojson.FromParquet(parquetInput, jsonBuffer) require.NoError(t, convertErr) assert.JSONEq(t, string(inputData), jsonBuffer.String()) @@ -506,11 +375,9 @@ func TestRoundTripNestedProps(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) jsonBuffer := &bytes.Buffer{} - convertErr := geojson.FromParquet(parquetFile, jsonBuffer) + convertErr := geojson.FromParquet(parquetInput, jsonBuffer) require.NoError(t, convertErr) assert.JSONEq(t, string(inputData), jsonBuffer.String()) @@ -527,11 +394,9 @@ func TestRoundTripNullGeometry(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) jsonBuffer := &bytes.Buffer{} - convertErr := geojson.FromParquet(parquetFile, jsonBuffer) + convertErr := 
geojson.FromParquet(parquetInput, jsonBuffer) require.NoError(t, convertErr) assert.JSONEq(t, string(inputData), jsonBuffer.String()) @@ -548,37 +413,59 @@ func TestRoundTripSparseProperties(t *testing.T) { require.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) jsonBuffer := &bytes.Buffer{} - convertErr := geojson.FromParquet(parquetFile, jsonBuffer) + convertErr := geojson.FromParquet(parquetInput, jsonBuffer) require.NoError(t, convertErr) assert.JSONEq(t, string(inputData), jsonBuffer.String()) } -func makeGeoParquet[T any](rows []T, metadata *geoparquet.Metadata) (*parquet.File, error) { - data := &bytes.Buffer{} - writer := geoparquet.NewGenericWriter[T](data, metadata) +func makeGeoParquetReader[T any](rows []T, metadata *geoparquet.Metadata) (*bytes.Reader, error) { + data, err := json.Marshal(rows) + if err != nil { + return nil, err + } + + parquetSchema, err := schema.NewSchemaFromStruct(rows[0]) + if err != nil { + return nil, err + } - _, writeErr := writer.Write(rows) - if writeErr != nil { - return nil, fmt.Errorf("trouble writing rows: %w", writeErr) + arrowSchema, err := pqarrow.FromParquet(parquetSchema, nil, nil) + if err != nil { + return nil, err } - closeErr := writer.Close() - if closeErr != nil { - return nil, fmt.Errorf("trouble closing writer: %w", closeErr) + output := &bytes.Buffer{} + recordWriter, err := geoparquet.NewRecordWriter(&geoparquet.WriterConfig{ + Writer: output, + Metadata: metadata, + ArrowSchema: arrowSchema, + }) + if err != nil { + return nil, err } - return parquet.OpenFile(bytes.NewReader(data.Bytes()), int64(data.Len())) + rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, arrowSchema, strings.NewReader(string(data))) + if err != nil { + return nil, err + } + + if err := recordWriter.Write(rec); err != nil { + return nil, err + } + if err := recordWriter.Close(); err != nil { + return nil, err + } + + return bytes.NewReader(output.Bytes()), nil } func TestWKT(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geometry string `parquet:"geometry"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry string `parquet:"name=geometry, logical=String" json:"geometry"` } rows := []*Row{ @@ -593,13 +480,13 @@ func TestWKT(t *testing.T) { } metadata := geoparquet.DefaultMetadata() - metadata.Columns[metadata.PrimaryColumn].Encoding = geoparquet.EncodingWKT + metadata.Columns[metadata.PrimaryColumn].Encoding = geo.EncodingWKT - file, fileErr := makeGeoParquet(rows, metadata) - require.NoError(t, fileErr) + reader, readerErr := makeGeoParquetReader(rows, metadata) + require.NoError(t, readerErr) output := &bytes.Buffer{} - convertErr := geojson.FromParquet(file, output) + convertErr := geojson.FromParquet(reader, output) require.NoError(t, convertErr) expected := `{ @@ -633,8 +520,8 @@ func TestWKT(t *testing.T) { func TestWKTNoEncoding(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geometry string `parquet:"geometry"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry string `parquet:"name=geometry, logical=String" json:"geometry"` } rows := []*Row{ @@ -647,11 +534,11 @@ func TestWKTNoEncoding(t *testing.T) { metadata := geoparquet.DefaultMetadata() metadata.Columns[metadata.PrimaryColumn].Encoding = "" - file, fileErr := makeGeoParquet(rows, metadata) - require.NoError(t, fileErr) + reader, readerErr := 
makeGeoParquetReader(rows, metadata) + require.NoError(t, readerErr) output := &bytes.Buffer{} - convertErr := geojson.FromParquet(file, output) + convertErr := geojson.FromParquet(reader, output) require.NoError(t, convertErr) expected := `{ @@ -675,8 +562,8 @@ func TestWKTNoEncoding(t *testing.T) { func TestWKB(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geometry []byte `parquet:"geometry"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry []byte `parquet:"name=geometry" json:"geometry"` } point, pointErr := wkb.Marshal(orb.Point{1, 2}) @@ -691,11 +578,11 @@ func TestWKB(t *testing.T) { metadata := geoparquet.DefaultMetadata() - file, fileErr := makeGeoParquet(rows, metadata) - require.NoError(t, fileErr) + reader, readerErr := makeGeoParquetReader(rows, metadata) + require.NoError(t, readerErr) output := &bytes.Buffer{} - convertErr := geojson.FromParquet(file, output) + convertErr := geojson.FromParquet(reader, output) require.NoError(t, convertErr) expected := `{ @@ -719,8 +606,8 @@ func TestWKB(t *testing.T) { func TestWKBNoEncoding(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geometry []byte `parquet:"geometry"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry []byte `parquet:"name=geometry" json:"geometry"` } point, pointErr := wkb.Marshal(orb.Point{1, 2}) @@ -736,11 +623,11 @@ func TestWKBNoEncoding(t *testing.T) { metadata := geoparquet.DefaultMetadata() metadata.Columns[metadata.PrimaryColumn].Encoding = "" - file, fileErr := makeGeoParquet(rows, metadata) - require.NoError(t, fileErr) + reader, readerErr := makeGeoParquetReader(rows, metadata) + require.NoError(t, readerErr) output := &bytes.Buffer{} - convertErr := geojson.FromParquet(file, output) + convertErr := geojson.FromParquet(reader, output) require.NoError(t, convertErr) expected := `{ @@ -772,10 +659,10 @@ func TestCodecUncompressed(t *testing.T) { assert.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - assert.Equal(t, parquet.Uncompressed.CompressionCodec(), parquetFile.Metadata().RowGroups[0].Columns[0].MetaData.Codec) + assert.Equal(t, compress.Codecs.Uncompressed, compress.Compression(fileReader.MetaData().RowGroups[0].Columns[0].MetaData.Codec)) } func TestCodecSnappy(t *testing.T) { @@ -788,10 +675,10 @@ func TestCodecSnappy(t *testing.T) { assert.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - assert.Equal(t, parquet.Snappy.CompressionCodec(), parquetFile.Metadata().RowGroups[0].Columns[0].MetaData.Codec) + assert.Equal(t, compress.Codecs.Snappy, compress.Compression(fileReader.MetaData().RowGroups[0].Columns[0].MetaData.Codec)) } func TestCodecGzip(t *testing.T) { @@ -804,10 +691,10 @@ func TestCodecGzip(t *testing.T) { assert.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - assert.Equal(t, parquet.Gzip.CompressionCodec(), 
parquetFile.Metadata().RowGroups[0].Columns[0].MetaData.Codec) + assert.Equal(t, compress.Codecs.Gzip, compress.Compression(fileReader.MetaData().RowGroups[0].Columns[0].MetaData.Codec)) } func TestCodecBrotli(t *testing.T) { @@ -820,10 +707,10 @@ func TestCodecBrotli(t *testing.T) { assert.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - assert.Equal(t, parquet.Brotli.CompressionCodec(), parquetFile.Metadata().RowGroups[0].Columns[0].MetaData.Codec) + assert.Equal(t, compress.Codecs.Brotli, compress.Compression(fileReader.MetaData().RowGroups[0].Columns[0].MetaData.Codec)) } func TestCodecZstd(t *testing.T) { @@ -836,26 +723,10 @@ func TestCodecZstd(t *testing.T) { assert.NoError(t, toParquetErr) parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) - - assert.Equal(t, parquet.Zstd.CompressionCodec(), parquetFile.Metadata().RowGroups[0].Columns[0].MetaData.Codec) -} - -func TestCodecLz4raw(t *testing.T) { - geojsonFile, openErr := os.Open("testdata/example.geojson") - require.NoError(t, openErr) - - parquetBuffer := &bytes.Buffer{} - convertOptions := &geojson.ConvertOptions{Compression: "lz4raw"} - toParquetErr := geojson.ToParquet(geojsonFile, parquetBuffer, convertOptions) - assert.NoError(t, toParquetErr) - - parquetInput := bytes.NewReader(parquetBuffer.Bytes()) - parquetFile, openErr := parquet.OpenFile(parquetInput, parquetInput.Size()) - require.NoError(t, openErr) + fileReader, fileErr := file.NewParquetReader(parquetInput) + require.NoError(t, fileErr) - assert.Equal(t, parquet.Lz4Raw.CompressionCodec(), parquetFile.Metadata().RowGroups[0].Columns[0].MetaData.Codec) + assert.Equal(t, compress.Codecs.Zstd, compress.Compression(fileReader.MetaData().RowGroups[0].Columns[0].MetaData.Codec)) } func TestCodecInvalid(t *testing.T) { diff --git a/internal/geojson/metadata.go b/internal/geojson/metadata.go deleted file mode 100644 index 9efde1a..0000000 --- a/internal/geojson/metadata.go +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Planet Labs PBC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
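
Note on the codec assertions above: the same check can be made outside the test suite with the arrow-go reader. The following is a minimal standalone sketch, not part of the patch; the "example.parquet" path is an invented placeholder, and only calls that already appear in the updated tests are used.

    package main

    import (
        "bytes"
        "fmt"
        "os"

        "github.com/apache/arrow/go/v14/parquet/compress"
        "github.com/apache/arrow/go/v14/parquet/file"
    )

    func main() {
        // "example.parquet" is a placeholder path, not a file in this repo.
        data, readErr := os.ReadFile("example.parquet")
        if readErr != nil {
            panic(readErr)
        }

        reader, fileErr := file.NewParquetReader(bytes.NewReader(data))
        if fileErr != nil {
            panic(fileErr)
        }
        defer reader.Close()

        // The thrift-level Codec value is numeric; converting it to
        // compress.Compression lets it be compared against compress.Codecs,
        // exactly as the updated codec tests do.
        codec := compress.Compression(reader.MetaData().RowGroups[0].Columns[0].MetaData.Codec)
        fmt.Println("first column chunk is zstd:", codec == compress.Codecs.Zstd)
    }
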
- -package geojson - -import ( - "github.com/planetlabs/gpq/internal/geoparquet" -) - -const primaryColumn = "geometry" - -func GetDefaultMetadata() *geoparquet.Metadata { - return &geoparquet.Metadata{ - Version: geoparquet.Version, - PrimaryColumn: primaryColumn, - Columns: map[string]*geoparquet.GeometryColumn{ - primaryColumn: { - Encoding: "WKB", - GeometryTypes: []string{}, - }, - }, - } -} diff --git a/internal/geojson/recordwriter.go b/internal/geojson/recordwriter.go new file mode 100644 index 0000000..dcbbd15 --- /dev/null +++ b/internal/geojson/recordwriter.go @@ -0,0 +1,104 @@ +package geojson + +import ( + "encoding/json" + "io" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/arrow/array" + orbjson "github.com/paulmach/orb/geojson" + "github.com/planetlabs/gpq/internal/geo" + "github.com/planetlabs/gpq/internal/geoparquet" +) + +type RecordWriter struct { + geoMetadata *geoparquet.Metadata + writer io.Writer + writing bool +} + +func NewRecordWriter(writer io.Writer, geoMetadata *geoparquet.Metadata) (*RecordWriter, error) { + w := &RecordWriter{writer: writer, geoMetadata: geoMetadata} + return w, nil +} + +var ( + featureCollectionPrefix = []byte(`{"type":"FeatureCollection","features":[`) + arraySeparator = []byte(",") + featureCollectionSuffix = []byte("]}") +) + +func (w *RecordWriter) Write(record arrow.Record) error { + if !w.writing { + if _, err := w.writer.Write(featureCollectionPrefix); err != nil { + return err + } + w.writing = true + } else { + if _, err := w.writer.Write(arraySeparator); err != nil { + return err + } + } + arr := array.RecordToStructArray(record) + defer arr.Release() + + schema := record.Schema() + for rowNum := 0; rowNum < arr.Len(); rowNum += 1 { + if rowNum > 0 { + if _, err := w.writer.Write(arraySeparator); err != nil { + return err + } + } + + var geometry *orbjson.Geometry + properties := map[string]any{} + for fieldNum := 0; fieldNum < arr.NumField(); fieldNum += 1 { + value := arr.Field(fieldNum).GetOneForMarshal(rowNum) + name := schema.Field(fieldNum).Name + if geomColumn, ok := w.geoMetadata.Columns[name]; ok { + g, decodeErr := geo.DecodeGeometry(value, geomColumn.Encoding) + if decodeErr != nil { + return decodeErr + } + if name == w.geoMetadata.PrimaryColumn { + geometry = g + continue + } + properties[name] = g + continue + } + properties[name] = value + } + + feature := map[string]any{ + "type": "Feature", + "properties": properties, + "geometry": geometry, + } + + featureData, jsonErr := json.Marshal(feature) + if jsonErr != nil { + return jsonErr + } + if _, err := w.writer.Write(featureData); err != nil { + return err + } + } + + return nil +} + +func (w *RecordWriter) Close() error { + if w.writing { + if _, err := w.writer.Write(featureCollectionSuffix); err != nil { + return err + } + w.writing = false + } + + closer, ok := w.writer.(io.Closer) + if ok { + return closer.Close() + } + return nil +} diff --git a/internal/geojson/schema.go b/internal/geojson/schema.go deleted file mode 100644 index 5e70d93..0000000 --- a/internal/geojson/schema.go +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2023 Planet Labs PBC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
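
The new RecordWriter above streams Arrow records back out as a GeoJSON FeatureCollection, decoding any column named in the geo metadata. A rough usage sketch follows; these are internal packages, so this is illustrative only, and the schema, property names, and WKT payload are invented for the example.

    package main

    import (
        "bytes"
        "fmt"
        "strings"

        "github.com/apache/arrow/go/v14/arrow"
        "github.com/apache/arrow/go/v14/arrow/array"
        "github.com/apache/arrow/go/v14/arrow/memory"
        "github.com/planetlabs/gpq/internal/geo"
        "github.com/planetlabs/gpq/internal/geojson"
        "github.com/planetlabs/gpq/internal/geoparquet"
    )

    func main() {
        // One string property plus a WKT-encoded geometry column.
        arrowSchema := arrow.NewSchema([]arrow.Field{
            {Name: "name", Type: arrow.BinaryTypes.String, Nullable: true},
            {Name: "geometry", Type: arrow.BinaryTypes.String, Nullable: true},
        }, nil)

        rec, _, recErr := array.RecordFromJSON(memory.DefaultAllocator, arrowSchema,
            strings.NewReader(`[{"name": "null-island", "geometry": "POINT (0 0)"}]`))
        if recErr != nil {
            panic(recErr)
        }
        defer rec.Release()

        // Mark the geometry column as WKT so DecodeGeometry can turn the
        // string values back into GeoJSON geometries.
        metadata := geoparquet.DefaultMetadata()
        metadata.Columns[metadata.PrimaryColumn].Encoding = geo.EncodingWKT

        output := &bytes.Buffer{}
        writer, writerErr := geojson.NewRecordWriter(output, metadata)
        if writerErr != nil {
            panic(writerErr)
        }
        if err := writer.Write(rec); err != nil {
            panic(err)
        }
        // Close emits the closing "]}" of the FeatureCollection.
        if err := writer.Close(); err != nil {
            panic(err)
        }
        fmt.Println(output.String())
    }
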
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package geojson - -import ( - "errors" - "fmt" - "reflect" - "strings" - "unicode" - - "github.com/paulmach/orb" - "github.com/paulmach/orb/encoding/wkb" - "github.com/segmentio/parquet-go" -) - -type ConvertFn func(any) (any, error) - -var identity ConvertFn = func(v any) (any, error) { - return v, nil -} - -type TypeConverter struct { - Type reflect.Type - Convert ConvertFn -} - -func typeConverterFromAny(v any) (*TypeConverter, error) { - if v == nil { - return nil, errors.New("cannot determine type from null") - } - - switch value := v.(type) { - case bool, float64, string: - converter := &TypeConverter{ - Type: reflect.TypeOf(value), - Convert: identity, - } - return converter, nil - case map[string]any: - return typeConverterFromMap(value) - case []any: - return typeConverterFromSlice(value) - default: - return nil, fmt.Errorf("unsupported type: %t", value) - } -} - -func typeConverterFromSlice(data []any) (*TypeConverter, error) { - if len(data) == 0 { - return nil, errors.New("cannot determine type from empty array") - } - - itemConverter, err := typeConverterFromAny(data[0]) - if err != nil { - return nil, err - } - - itemType := itemConverter.Type - - for _, v := range data[1:] { - _, err := itemConverter.Convert(v) - if err != nil { - return nil, fmt.Errorf("unsupported array of mixed type: %w", err) - } - } - - converter := &TypeConverter{ - Type: reflect.SliceOf(itemType), - Convert: func(v any) (any, error) { - data, ok := v.([]any) - if !ok { - return nil, fmt.Errorf("expected []any, got %t", v) - } - slice := reflect.MakeSlice(reflect.SliceOf(itemType), len(data), len(data)) - for i, d := range data { - value, err := itemConverter.Convert(d) - if err != nil { - return nil, err - } - itemValue := reflect.ValueOf(value) - if itemValue.Type() != itemType { - return nil, fmt.Errorf("mixed array, expected %s, but got %s", itemType, itemValue.Type()) - } - slice.Index(i).Set(itemValue) - } - return slice.Interface(), nil - }, - } - - return converter, nil -} - -type FieldConverter struct { - Field reflect.StructField - Convert ConvertFn -} - -func typeConverterFromMap(data map[string]any) (*TypeConverter, error) { - fieldConverters, err := fieldConvertersFromMap(data) - if err != nil { - return nil, err - } - return structConverter(fieldConverters) -} - -func structConverter(fieldConverters map[string]*FieldConverter) (*TypeConverter, error) { - convertLookup := map[string]ConvertFn{} - nameLookup := map[string]string{} - - fields := []reflect.StructField{} - for key, fieldConverter := range fieldConverters { - fields = append(fields, fieldConverter.Field) - convertLookup[key] = fieldConverter.Convert - nameLookup[key] = fieldConverter.Field.Name - } - - structType := reflect.StructOf(fields) - - converter := &TypeConverter{ - Type: structType, - Convert: func(d any) (any, error) { - data, ok := d.(map[string]any) - if !ok { - return nil, fmt.Errorf("expected map[string]any, got %t", d) - } - - structValue := reflect.New(structType).Elem() - for k, v := range data { - convert, ok := convertLookup[k] - if !ok { - return nil, fmt.Errorf("unexpected property name 
%q", k) - } - name, ok := nameLookup[k] - if !ok { - return nil, fmt.Errorf("unexpected property name %q", k) - } - if v == nil { - continue - } - - value, err := convert(v) - if err != nil { - return nil, fmt.Errorf("unable to convert value %v for %q: %w", v, k, err) - } - - fieldValue := structValue.FieldByName(name) - if fieldValue.Type() != reflect.TypeOf(value) { - return nil, fmt.Errorf("mixed types for %q, expected %s, but got %s", k, fieldValue.Type(), reflect.TypeOf(value)) - } - fieldValue.Set(reflect.ValueOf(value)) - } - return structValue.Interface(), nil - }, - } - return converter, nil -} - -func fieldName(key string, offset int) string { - letters := []rune("GPQ_") - for _, r := range strings.ToUpper(key) { - if !(unicode.IsLetter(r) || unicode.IsNumber(r)) { - r = '_' - } - letters = append(letters, r) - } - return fmt.Sprintf("%s_%d", string(letters), offset) -} - -func fieldConverterFromAny(key string, offset int, value any) (*FieldConverter, error) { - typeConverter, err := typeConverterFromAny(value) - if err != nil { - return nil, err - } - - repetition := "optional" - if typeConverter.Type.Kind() == reflect.Slice { - repetition = "" - } - - fieldConverter := &FieldConverter{ - Field: reflect.StructField{ - Name: fieldName(key, offset), - Type: typeConverter.Type, - Tag: makeStructTag("parquet", key, repetition), - }, - Convert: typeConverter.Convert, - } - - return fieldConverter, nil -} - -func fieldConvertersFromMap(data map[string]any) (map[string]*FieldConverter, error) { - fieldConverters := map[string]*FieldConverter{} - for key, v := range data { - fieldConverter, err := fieldConverterFromAny(key, len(fieldConverters), v) - if err != nil { - return nil, err - } - - fieldConverters[key] = fieldConverter - } - return fieldConverters, nil -} - -func makeStructTag(name string, values ...string) reflect.StructTag { - nonEmptyValues := []string{} - for _, value := range values { - if value == "" { - continue - } - nonEmptyValues = append(nonEmptyValues, value) - } - return reflect.StructTag(fmt.Sprintf("%s:%q", name, strings.Join(nonEmptyValues, ","))) -} - -type SchemaBuilder struct { - fieldConverters map[string]*FieldConverter - lastError error -} - -func (sb *SchemaBuilder) Error() error { - return sb.lastError -} - -func (sb *SchemaBuilder) isComplete() bool { - if sb.fieldConverters == nil { - return false - } - for _, v := range sb.fieldConverters { - if v == nil { - return false - } - } - return true -} - -func (sb *SchemaBuilder) Add(feature *Feature) bool { - if sb.fieldConverters == nil { - sb.fieldConverters = map[string]*FieldConverter{} - } - - fieldConverters := sb.fieldConverters - for key, value := range feature.Properties { - if value == nil { - if _, ok := fieldConverters[key]; !ok { - if sb.lastError == nil { - sb.lastError = fmt.Errorf("null value for %q", key) - } - fieldConverters[key] = nil - } - continue - } - - if fieldConverters[key] != nil { - continue - } - - fieldConverter, err := fieldConverterFromAny(key, len(fieldConverters), value) - if err != nil { - sb.lastError = err - fieldConverters[key] = nil - continue - } - - fieldConverters[key] = fieldConverter - } - - if fieldConverters[primaryColumn] != nil { - return sb.isComplete() - } - - geometryData, wkbErr := wkb.Marshal(feature.Geometry) - if wkbErr != nil { - fieldConverters[primaryColumn] = nil - sb.lastError = fmt.Errorf("failed to encode geometry: %w", wkbErr) - return false - } - - fieldConverters[primaryColumn] = &FieldConverter{ - Field: reflect.StructField{ - Name: 
fieldName(primaryColumn, len(fieldConverters)), - Type: reflect.TypeOf(geometryData), - Tag: makeStructTag("parquet", primaryColumn, "optional"), - }, - Convert: func(v any) (any, error) { - geometry, ok := v.(orb.Geometry) - if !ok { - return nil, fmt.Errorf("expected geometry, got %t", v) - } - return wkb.Marshal(geometry) - }, - } - - return sb.isComplete() -} - -func (sb *SchemaBuilder) Converter() (*TypeConverter, error) { - if !sb.isComplete() { - if err := sb.Error(); err != nil { - return nil, err - } - - return nil, errors.New("not enough features have been added to build a schema") - } - - converter, converterErr := structConverter(sb.fieldConverters) - if converterErr != nil { - return nil, converterErr - } - - featureConverter := &TypeConverter{ - Type: converter.Type, - Convert: func(f any) (any, error) { - feature, ok := f.(*Feature) - if !ok { - return nil, fmt.Errorf("expected feature, got %t", f) - } - data := map[string]any{} - for k, v := range feature.Properties { - data[k] = v - } - data[primaryColumn] = feature.Geometry - return converter.Convert(data) - }, - } - return featureConverter, nil -} - -func (sb *SchemaBuilder) Schema() (*parquet.Schema, error) { - converter, err := sb.Converter() - if err != nil { - return nil, err - } - - schema := parquet.SchemaOf(reflect.New(converter.Type).Elem().Interface()) - return schema, nil -} - -func SchemaOf(feature *Feature) (*parquet.Schema, error) { - schemaBuilder := &SchemaBuilder{} - schemaBuilder.Add(feature) - return schemaBuilder.Schema() -} diff --git a/internal/geojson/schema_test.go b/internal/geojson/schema_test.go deleted file mode 100644 index 0d7b5ab..0000000 --- a/internal/geojson/schema_test.go +++ /dev/null @@ -1,350 +0,0 @@ -// Copyright 2023 Planet Labs PBC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
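
The reflection-based SchemaBuilder deleted above is replaced by pqutil.ArrowSchemaBuilder, driven from the new ToParquet shown earlier. A sketch of that flow in isolation; the property values are invented, and the premise that a nil value keeps the builder from being Ready is an assumption inferred from the old null handling and the buffering logic in ToParquet.

    package main

    import (
        "fmt"

        "github.com/planetlabs/gpq/internal/geoparquet"
        "github.com/planetlabs/gpq/internal/pqutil"
    )

    func main() {
        builder := pqutil.NewArrowSchemaBuilder()

        // Property maps play the role of feature.Properties in ToParquet. A nil
        // value carries no type information, so the builder is presumably not
        // Ready until a later feature supplies a concrete "pop_est" value.
        if err := builder.Add(map[string]any{"name": "Fiji", "pop_est": nil}); err != nil {
            panic(err)
        }
        fmt.Println("ready after first feature:", builder.Ready())

        if err := builder.Add(map[string]any{"name": "United States of America", "pop_est": float64(326625791)}); err != nil {
            panic(err)
        }

        if builder.Ready() {
            // The geometry column is appended last, as in writeBuffered above.
            if err := builder.AddGeometry(geoparquet.DefaultGeometryColumn, geoparquet.DefaultGeometryEncoding); err != nil {
                panic(err)
            }
            arrowSchema, err := builder.Schema()
            if err != nil {
                panic(err)
            }
            fmt.Println(arrowSchema)
        }
    }
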
- -package geojson_test - -import ( - "os" - "testing" - - "github.com/paulmach/orb" - "github.com/planetlabs/gpq/internal/geojson" - "github.com/segmentio/parquet-go" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestSchemaOf(t *testing.T) { - input, openErr := os.Open("testdata/example.geojson") - require.NoError(t, openErr) - - reader := geojson.NewFeatureReader(input) - feature, readErr := reader.Next() - require.NoError(t, readErr) - - schema, schemaErr := geojson.SchemaOf(feature) - require.NoError(t, schemaErr) - require.Len(t, schema.Fields(), 6) - - continent, ok := schema.Lookup("continent") - require.True(t, ok) - assert.True(t, continent.Node.Optional()) - assert.Equal(t, parquet.String().Type(), continent.Node.Type()) - - name, ok := schema.Lookup("name") - require.True(t, ok) - assert.True(t, name.Node.Optional()) - assert.Equal(t, parquet.String().Type(), name.Node.Type()) - - iso, ok := schema.Lookup("iso_a3") - require.True(t, ok) - assert.True(t, iso.Node.Optional()) - assert.Equal(t, parquet.String().Type(), iso.Node.Type()) - - gdp, ok := schema.Lookup("gdp_md_est") - require.True(t, ok) - assert.True(t, gdp.Node.Optional()) - assert.Equal(t, parquet.DoubleType, gdp.Node.Type()) - - pop, ok := schema.Lookup("pop_est") - require.True(t, ok) - assert.True(t, pop.Node.Optional()) - assert.Equal(t, parquet.DoubleType, pop.Node.Type()) - - geometry, ok := schema.Lookup("geometry") - require.True(t, ok) - assert.True(t, geometry.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geometry.Node.Type()) -} - -func TestSchemaOfArrayOfStrings(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Properties: map[string]any{ - path: []any{"one", "two"}, - }, - } - - schema, schemaErr := geojson.SchemaOf(feature) - require.NoError(t, schemaErr) - - column, ok := schema.Lookup(path) - require.True(t, ok) - - assert.True(t, column.Node.Repeated()) - assert.Equal(t, parquet.String().Type(), column.Node.Type()) -} - -func TestSchemaOfArrayOfNumbers(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Properties: map[string]any{ - path: []any{float64(42), float64(21)}, - }, - } - - schema, schemaErr := geojson.SchemaOf(feature) - require.NoError(t, schemaErr) - - column, ok := schema.Lookup(path) - require.True(t, ok) - - assert.True(t, column.Node.Repeated()) - assert.Equal(t, parquet.DoubleType, column.Node.Type()) -} - -func converterFromFeature(feature *geojson.Feature) (*geojson.TypeConverter, error) { - schemaBuilder := &geojson.SchemaBuilder{} - schemaBuilder.Add(feature) - return schemaBuilder.Converter() -} - -func TestConverterSliceOfFloat(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Geometry: orb.Point{}, - Properties: map[string]any{ - path: []any{float64(42), float64(21)}, - }, - } - - converter, converterErr := converterFromFeature(feature) - require.NoError(t, converterErr) - - _, convertErr := converter.Convert(feature) - require.NoError(t, convertErr) -} - -func TestConverterSliceOfString(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Geometry: orb.Point{}, - Properties: map[string]any{ - path: []any{"one", "two"}, - }, - } - - converter, converterErr := converterFromFeature(feature) - require.NoError(t, converterErr) - - _, convertErr := converter.Convert(feature) - require.NoError(t, convertErr) -} - -func TestConverterSliceOfMixed(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Geometry: orb.Point{}, - Properties: 
map[string]any{ - path: []any{"one", "two"}, - }, - } - - converter, converterErr := converterFromFeature(feature) - require.NoError(t, converterErr) - - mixed := &geojson.Feature{ - Geometry: orb.Point{}, - Properties: map[string]any{ - path: []any{"oops", 42}, - }, - } - - _, convertErr := converter.Convert(mixed) - assert.EqualError(t, convertErr, "unable to convert value [oops 42] for \"test\": mixed array, expected string, but got int") -} - -func TestConverterNilGeometry(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Geometry: orb.Point{}, - Properties: map[string]any{ - path: "has geom", - }, - } - - converter, converterErr := converterFromFeature(feature) - require.NoError(t, converterErr) - - null := &geojson.Feature{ - Properties: map[string]any{ - path: "hasn't geom", - }, - } - - _, convertErr := converter.Convert(null) - assert.NoError(t, convertErr) -} - -func TestConverterNilSlice(t *testing.T) { - path := "test" - - feature := &geojson.Feature{ - Properties: map[string]any{ - path: []any{"one", "two"}, - }, - } - - converter, converterErr := converterFromFeature(feature) - require.NoError(t, converterErr) - - null := &geojson.Feature{ - Properties: map[string]any{}, - } - - _, convertErr := converter.Convert(null) - assert.NoError(t, convertErr) -} - -func TestSchemaBuilder(t *testing.T) { - prop1 := "test-property-1" - prop2 := "test-property-2" - - features := []*geojson.Feature{ - { - Properties: map[string]any{ - prop1: "test-value-1", - prop2: "test-value-2", - }, - }, - } - - schemaBuilder := &geojson.SchemaBuilder{} - complete := schemaBuilder.Add(features[0]) - assert.True(t, complete) - - schema, schemaErr := schemaBuilder.Schema() - require.NoError(t, schemaErr) - - require.NoError(t, schemaErr) - require.Len(t, schema.Fields(), 3) - - col1, ok := schema.Lookup(prop1) - require.True(t, ok) - assert.True(t, col1.Node.Optional()) - assert.Equal(t, parquet.String().Type(), col1.Node.Type()) - - col2, ok := schema.Lookup(prop2) - require.True(t, ok) - assert.True(t, col2.Node.Optional()) - assert.Equal(t, parquet.String().Type(), col2.Node.Type()) - - geom, ok := schema.Lookup("geometry") - require.True(t, ok) - assert.True(t, geom.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geom.Node.Type()) -} - -func TestSchemaBuilderSparse(t *testing.T) { - prop1 := "test-property-1" - prop2 := "test-property-2" - - features := []*geojson.Feature{ - { - Properties: map[string]any{ - prop1: "test-value-1", - }, - }, - { - Properties: map[string]any{ - prop2: "test-value-2", - }, - }, - } - - schemaBuilder := &geojson.SchemaBuilder{} - - assert.True(t, schemaBuilder.Add(features[0])) - assert.True(t, schemaBuilder.Add(features[1])) - - schema, schemaErr := schemaBuilder.Schema() - require.NoError(t, schemaErr) - - require.NoError(t, schemaErr) - require.Len(t, schema.Fields(), 3) - - col1, ok := schema.Lookup(prop1) - require.True(t, ok) - assert.True(t, col1.Node.Optional()) - assert.Equal(t, parquet.String().Type(), col1.Node.Type()) - - col2, ok := schema.Lookup(prop2) - require.True(t, ok) - assert.True(t, col2.Node.Optional()) - assert.Equal(t, parquet.String().Type(), col2.Node.Type()) - - geom, ok := schema.Lookup("geometry") - require.True(t, ok) - assert.True(t, geom.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geom.Node.Type()) -} - -func TestSchemaBuilderSparseNulls(t *testing.T) { - prop1 := "test-property-1" - prop2 := "test-property-2" - - features := []*geojson.Feature{ - { - Properties: map[string]any{ - prop1: 
"test-value-1", - prop2: nil, - }, - }, - { - Properties: map[string]any{ - prop1: nil, - prop2: nil, - }, - }, - { - Properties: map[string]any{ - prop1: nil, - prop2: "test-value-2", - }, - }, - } - - schemaBuilder := &geojson.SchemaBuilder{} - - assert.False(t, schemaBuilder.Add(features[0])) - assert.False(t, schemaBuilder.Add(features[1])) - assert.True(t, schemaBuilder.Add(features[2])) - - schema, schemaErr := schemaBuilder.Schema() - require.NoError(t, schemaErr) - - require.NoError(t, schemaErr) - require.Len(t, schema.Fields(), 3) - - col1, ok := schema.Lookup(prop1) - require.True(t, ok) - assert.True(t, col1.Node.Optional()) - assert.Equal(t, parquet.String().Type(), col1.Node.Type()) - - col2, ok := schema.Lookup(prop2) - require.True(t, ok) - assert.True(t, col2.Node.Optional()) - assert.Equal(t, parquet.String().Type(), col2.Node.Type()) - - geom, ok := schema.Lookup("geometry") - require.True(t, ok) - assert.True(t, geom.Node.Optional()) - assert.Equal(t, parquet.ByteArrayType, geom.Node.Type()) -} diff --git a/internal/geoparquet/featurewriter.go b/internal/geoparquet/featurewriter.go new file mode 100644 index 0000000..ba8bd0c --- /dev/null +++ b/internal/geoparquet/featurewriter.go @@ -0,0 +1,317 @@ +package geoparquet + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/arrow/array" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/pqarrow" + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" + "github.com/paulmach/orb/encoding/wkt" + "github.com/planetlabs/gpq/internal/geo" +) + +type FeatureWriter struct { + geoMetadata *Metadata + maxRowGroupLength int64 + bufferedLength int64 + fileWriter *pqarrow.FileWriter + recordBuilder *array.RecordBuilder + geometryTypeLookup map[string]map[string]bool + boundsLookup map[string]*orb.Bound +} + +func NewFeatureWriter(config *WriterConfig) (*FeatureWriter, error) { + parquetProps := config.ParquetWriterProps + if parquetProps == nil { + parquetProps = parquet.NewWriterProperties() + } + + arrowProps := config.ArrowWriterProps + if arrowProps == nil { + defaults := pqarrow.DefaultWriterProps() + arrowProps = &defaults + } + + geoMetadata := config.Metadata + if geoMetadata == nil { + geoMetadata = DefaultMetadata() + } + + if config.ArrowSchema == nil { + return nil, errors.New("schema is required") + } + + if config.Writer == nil { + return nil, errors.New("writer is required") + } + fileWriter, fileErr := pqarrow.NewFileWriter(config.ArrowSchema, config.Writer, parquetProps, *arrowProps) + if fileErr != nil { + return nil, fileErr + } + + writer := &FeatureWriter{ + geoMetadata: geoMetadata, + fileWriter: fileWriter, + maxRowGroupLength: parquetProps.MaxRowGroupLength(), + bufferedLength: 0, + recordBuilder: array.NewRecordBuilder(parquetProps.Allocator(), config.ArrowSchema), + geometryTypeLookup: map[string]map[string]bool{}, + boundsLookup: map[string]*orb.Bound{}, + } + + return writer, nil +} + +func (w *FeatureWriter) Write(feature *geo.Feature) error { + arrowSchema := w.recordBuilder.Schema() + numFields := arrowSchema.NumFields() + for i := 0; i < numFields; i++ { + field := arrowSchema.Field(i) + builder := w.recordBuilder.Field(i) + if err := w.append(feature, field, builder); err != nil { + return err + } + } + w.bufferedLength += 1 + if w.bufferedLength >= w.maxRowGroupLength { + return w.writeBuffered() + } + return nil +} + +func (w *FeatureWriter) writeBuffered() error { + record := 
w.recordBuilder.NewRecord() + defer record.Release() + if err := w.fileWriter.WriteBuffered(record); err != nil { + return err + } + w.bufferedLength = 0 + return nil +} + +func (w *FeatureWriter) append(feature *geo.Feature, field arrow.Field, builder array.Builder) error { + name := field.Name + if w.geoMetadata.Columns[name] != nil { + return w.appendGeometry(feature, field, builder) + } + + value, ok := feature.Properties[name] + if !ok || value == nil { + if !field.Nullable { + return fmt.Errorf("field %q is required, but the property is missing in the feature", name) + } + builder.AppendNull() + return nil + } + + return w.appendValue(name, value, builder) +} + +func (w *FeatureWriter) appendValue(name string, value any, builder array.Builder) error { + switch b := builder.(type) { + case *array.BooleanBuilder: + v, ok := value.(bool) + if !ok { + return fmt.Errorf("expected %q to be a boolean, got %v", name, value) + } + b.Append(v) + case *array.StringBuilder: + v, ok := value.(string) + if !ok { + return fmt.Errorf("expected %q to be a string, got %v", name, value) + } + b.Append(v) + case *array.Float64Builder: + v, ok := value.(float64) + if !ok { + return fmt.Errorf("expected %q to be a float64, got %v", name, value) + } + b.Append(v) + case *array.ListBuilder: + b.Append(true) + valueBuilder := b.ValueBuilder() + switch vb := valueBuilder.(type) { + case *array.BooleanBuilder: + v, ok := toUniformSlice[bool](value) + if !ok { + return fmt.Errorf("expected %q to be []bool, got %v", name, value) + } + vb.AppendValues(v, nil) + case *array.StringBuilder: + v, ok := toUniformSlice[string](value) + if !ok { + return fmt.Errorf("expected %q to be []string, got %v", name, value) + } + vb.AppendValues(v, nil) + case *array.Float64Builder: + v, ok := toUniformSlice[float64](value) + if !ok { + return fmt.Errorf("expected %q to be []float64, got %v", name, value) + } + vb.AppendValues(v, nil) + case *array.StructBuilder: + v, ok := value.([]any) + if !ok { + return fmt.Errorf("expected %q to be []any, got %v", name, value) + } + for _, item := range v { + if err := w.appendValue(name, item, vb); err != nil { + return err + } + } + default: + return fmt.Errorf("unsupported list element builder type %#v", vb) + } + case *array.StructBuilder: + v, ok := value.(map[string]any) + if !ok { + return fmt.Errorf("expected %q to be map[string]any, got %v", name, value) + } + t, ok := b.Type().(*arrow.StructType) + if !ok { + return fmt.Errorf("expected builder for %q to have a struct type, got %v", name, b.Type()) + } + b.Append(true) + for i := 0; i < b.NumField(); i += 1 { + field := t.Field(i) + name := field.Name + fieldValue, ok := v[name] + fieldBuilder := b.FieldBuilder(i) + if !ok || fieldValue == nil { + if !field.Nullable { + return fmt.Errorf("field %q is required, but the property is missing", name) + } + fieldBuilder.AppendNull() + continue + } + if err := w.appendValue(name, fieldValue, fieldBuilder); err != nil { + return err + } + } + default: + return fmt.Errorf("unsupported builder type %#v", b) + } + + return nil +} + +func toUniformSlice[T any](value any) ([]T, bool) { + if values, ok := value.([]T); ok { + return values, true + } + slice, ok := value.([]any) + if !ok { + return nil, false + } + values := make([]T, len(slice)) + for i, v := range slice { + t, ok := v.(T) + if !ok { + return nil, false + } + values[i] = t + } + return values, true +} + +func (w *FeatureWriter) appendGeometry(feature *geo.Feature, field arrow.Field, builder array.Builder) error { + name := 
field.Name + geomColumn := w.geoMetadata.Columns[name] + + binaryBuilder, ok := builder.(*array.BinaryBuilder) + if !ok { + return fmt.Errorf("expected column %q to have a binary type, got %s", name, builder.Type().Name()) + } + var geometry orb.Geometry + if name == w.geoMetadata.PrimaryColumn { + geometry = feature.Geometry + } else { + if value, ok := feature.Properties[name]; ok { + g, ok := value.(orb.Geometry) + if !ok { + return fmt.Errorf("expected %q to be a geometry, got %v", name, value) + } + geometry = g + } + } + if geometry == nil { + if !field.Nullable { + return fmt.Errorf("feature missing required %q geometry", name) + } + binaryBuilder.AppendNull() + return nil + } + + if w.geometryTypeLookup[name] == nil { + w.geometryTypeLookup[name] = map[string]bool{} + } + w.geometryTypeLookup[name][geometry.GeoJSONType()] = true + + bounds := geometry.Bound() + if w.boundsLookup[name] != nil { + bounds = bounds.Union(*w.boundsLookup[name]) + } + w.boundsLookup[name] = &bounds + + switch geomColumn.Encoding { + case geo.EncodingWKB: + data, err := wkb.Marshal(geometry) + if err != nil { + return fmt.Errorf("failed to encode %q as WKB: %w", name, err) + } + binaryBuilder.Append(data) + return nil + case geo.EncodingWKT: + binaryBuilder.Append(wkt.Marshal(geometry)) + return nil + default: + return fmt.Errorf("unsupported geometry encoding: %s", geomColumn.Encoding) + } +} + +func (w *FeatureWriter) Close() error { + defer w.recordBuilder.Release() + if w.bufferedLength > 0 { + if err := w.writeBuffered(); err != nil { + return err + } + } + + geoMetadata := w.geoMetadata.Clone() + for name, bounds := range w.boundsLookup { + if bounds != nil { + if geoMetadata.Columns[name] == nil { + geoMetadata.Columns[name] = getDefaultGeometryColumn() + } + geoMetadata.Columns[name].Bounds = []float64{ + bounds.Left(), bounds.Bottom(), bounds.Right(), bounds.Top(), + } + } + } + for name, types := range w.geometryTypeLookup { + geometryTypes := []string{} + if len(types) > 0 { + for geometryType := range types { + geometryTypes = append(geometryTypes, geometryType) + } + } + if geoMetadata.Columns[name] == nil { + geoMetadata.Columns[name] = getDefaultGeometryColumn() + } + geoMetadata.Columns[name].GeometryTypes = geometryTypes + } + + data, err := json.Marshal(geoMetadata) + if err != nil { + return fmt.Errorf("failed to encode %s file metadata", MetadataKey) + } + if err := w.fileWriter.AppendKeyValueMetadata(MetadataKey, string(data)); err != nil { + return fmt.Errorf("failed to append %s file metadata", MetadataKey) + } + return w.fileWriter.Close() +} diff --git a/internal/geoparquet/geoparquet.go b/internal/geoparquet/geoparquet.go index 3761b63..8b36123 100644 --- a/internal/geoparquet/geoparquet.go +++ b/internal/geoparquet/geoparquet.go @@ -1,500 +1,171 @@ -// Copyright 2023 Planet Labs PBC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
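To make the new FeatureWriter concrete, here is a minimal usage sketch. It is illustrative only: it assumes code running inside this module (the internal packages are not importable elsewhere), and the output path and sample feature are invented for the example.

// Sketch: write one feature to a GeoParquet file with the FeatureWriter.
package main

import (
	"log"
	"os"

	"github.com/paulmach/orb"
	"github.com/planetlabs/gpq/internal/geo"
	"github.com/planetlabs/gpq/internal/geoparquet"
	"github.com/planetlabs/gpq/internal/pqutil"
)

func main() {
	feature := &geo.Feature{
		Geometry:   orb.Point{1, 2},
		Properties: map[string]any{"name": "test-point"},
	}

	// Derive an Arrow schema from the feature properties, then add the
	// geometry column with its encoding (see pqutil.ArrowSchemaBuilder below).
	builder := pqutil.NewArrowSchemaBuilder()
	if err := builder.Add(feature.Properties); err != nil {
		log.Fatal(err)
	}
	if err := builder.AddGeometry(geoparquet.DefaultGeometryColumn, geo.EncodingWKB); err != nil {
		log.Fatal(err)
	}
	arrowSchema, schemaErr := builder.Schema()
	if schemaErr != nil {
		log.Fatal(schemaErr)
	}

	output, createErr := os.Create("example.parquet") // hypothetical output path
	if createErr != nil {
		log.Fatal(createErr)
	}
	defer output.Close()

	writer, writerErr := geoparquet.NewFeatureWriter(&geoparquet.WriterConfig{
		Writer:      output,
		ArrowSchema: arrowSchema,
	})
	if writerErr != nil {
		log.Fatal(writerErr)
	}
	if err := writer.Write(feature); err != nil {
		log.Fatal(err)
	}
	// Close flushes any buffered rows and appends the "geo" metadata,
	// including the bounds and geometry types accumulated during Write.
	if err := writer.Close(); err != nil {
		log.Fatal(err)
	}
}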
- package geoparquet import ( "encoding/json" "fmt" "io" - "strings" - "github.com/paulmach/orb" + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/arrow/array" + "github.com/apache/arrow/go/v14/arrow/memory" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/compress" + "github.com/apache/arrow/go/v14/parquet/file" + "github.com/apache/arrow/go/v14/parquet/schema" "github.com/paulmach/orb/encoding/wkb" "github.com/paulmach/orb/encoding/wkt" - "github.com/segmentio/parquet-go" - "github.com/segmentio/parquet-go/compress" -) - -const ( - Version = "1.0.0" - MetadataKey = "geo" - EncodingWKB = "WKB" - EncodingWKT = "WKT" - EdgesPlanar = "planar" - EdgesSpherical = "spherical" - OrientationCounterClockwise = "counterclockwise" - DefaultGeometryColumn = "geometry" + "github.com/planetlabs/gpq/internal/geo" + "github.com/planetlabs/gpq/internal/pqutil" ) -var GeometryTypes = []string{ - "Point", - "LineString", - "Polygon", - "MultiPoint", - "MultiLineString", - "MultiPolygon", - "GeometryCollection", - "Point Z", - "LineString Z", - "Polygon Z", - "MultiPoint Z", - "MultiLineString Z", - "MultiPolygon Z", - "GeometryCollection Z", -} - -type Metadata struct { - Version string `json:"version"` - PrimaryColumn string `json:"primary_column"` - Columns map[string]*GeometryColumn `json:"columns"` -} - -func (m *Metadata) Clone() *Metadata { - clone := &Metadata{} - *clone = *m - clone.Columns = make(map[string]*GeometryColumn, len(m.Columns)) - for i, v := range m.Columns { - clone.Columns[i] = v.clone() - } - return clone -} - -type ProjId struct { - Authority string `json:"authority"` - Code any `json:"code"` -} - -type Proj struct { - Name string `json:"name"` - Id *ProjId `json:"id"` -} - -func (p *Proj) String() string { - id := "" - if p.Id != nil { - if code, ok := p.Id.Code.(string); ok { - id = p.Id.Authority + ":" + code - } else if code, ok := p.Id.Code.(float64); ok { - id = fmt.Sprintf("%s:%g", p.Id.Authority, code) - } - } - if p.Name != "" { - return p.Name - } - if id == "" { - return "Unknown" - } - return id -} - -type GeometryColumn struct { - Encoding string `json:"encoding"` - GeometryType any `json:"geometry_type,omitempty"` - GeometryTypes any `json:"geometry_types"` - CRS *Proj `json:"crs,omitempty"` - Edges string `json:"edges,omitempty"` - Orientation string `json:"orientation,omitempty"` - Bounds []float64 `json:"bbox,omitempty"` - Epoch float64 `json:"epoch,omitempty"` -} - -func (g *GeometryColumn) clone() *GeometryColumn { - clone := &GeometryColumn{} - *clone = *g - clone.Bounds = make([]float64, len(g.Bounds)) - copy(clone.Bounds, g.Bounds) - return clone -} - -func (col *GeometryColumn) GetGeometryTypes() []string { - if multiType, ok := col.GeometryTypes.([]any); ok { - types := make([]string, len(multiType)) - for i, value := range multiType { - geometryType, ok := value.(string) - if !ok { - return nil - } - types[i] = geometryType - } - return types - } - - if singleType, ok := col.GeometryType.(string); ok { - return []string{singleType} - } - - values, ok := col.GeometryType.([]any) - if !ok { - return nil - } - - types := make([]string, len(values)) - for i, value := range values { - geometryType, ok := value.(string) - if !ok { - return nil - } - types[i] = geometryType - } - - return types -} - -func getDefaultGeometryColumn() *GeometryColumn { - return &GeometryColumn{ - Encoding: EncodingWKB, - GeometryTypes: []string{}, - } -} - -func DefaultMetadata() *Metadata { - return &Metadata{ - Version: 
Version, - PrimaryColumn: DefaultGeometryColumn, - Columns: map[string]*GeometryColumn{ - DefaultGeometryColumn: getDefaultGeometryColumn(), - }, - } -} - -var ErrNoMetadata = fmt.Errorf("missing %s metadata key", MetadataKey) - -func GetMetadataValue(file *parquet.File) (string, error) { - value, ok := file.Lookup(MetadataKey) - if !ok { - return "", ErrNoMetadata - } - return value, nil -} - -func GetMetadata(file *parquet.File) (*Metadata, error) { - value, valueErr := GetMetadataValue(file) - if valueErr != nil { - return nil, valueErr - } - geoFileMetadata := &Metadata{} - jsonErr := json.Unmarshal([]byte(value), geoFileMetadata) - if jsonErr != nil { - return nil, fmt.Errorf("unable to parse geo metadata: %w", jsonErr) - } - return geoFileMetadata, nil -} - -const defaultBatchSize = 128 - -type RowReader struct { - file *parquet.File - groups []parquet.RowGroup - groupIndex int - rowIndex int - rowBuffer []parquet.Row - rowsRead int - reader parquet.Rows -} - -func NewRowReader(file *parquet.File) *RowReader { - groups := file.RowGroups() - - return &RowReader{ - file: file, - groups: groups, - rowBuffer: make([]parquet.Row, defaultBatchSize), - } -} - -func (r *RowReader) closeReader() error { - if r.reader == nil { - return nil - } - err := r.reader.Close() - r.reader = nil - return err -} - -func (r *RowReader) Next() (parquet.Row, error) { - if r.groupIndex >= len(r.groups) { - return nil, io.EOF - } - - if r.rowIndex == 0 { - if r.reader == nil { - group := r.groups[r.groupIndex] - r.reader = group.Rows() - } - rowsRead, readErr := r.reader.ReadRows(r.rowBuffer) - r.rowsRead = rowsRead - if readErr != nil { - closeErr := r.closeReader() - if readErr != io.EOF { - return nil, readErr - } - if closeErr != nil { - return nil, closeErr - } - } - } - - if r.rowIndex >= r.rowsRead { - r.rowIndex = 0 - if r.rowsRead < len(r.rowBuffer) { - if err := r.closeReader(); err != nil { - return nil, err - } - r.groupIndex += 1 - } - return r.Next() - } - - row := r.rowBuffer[r.rowIndex] - r.rowIndex += 1 - return row, nil -} - -func (r *RowReader) Close() error { - return r.closeReader() -} - -type GenericWriter[T any] struct { - writer *parquet.GenericWriter[T] - metadata *Metadata -} - -func NewGenericWriter[T any](output io.Writer, metadata *Metadata, options ...parquet.WriterOption) *GenericWriter[T] { - return &GenericWriter[T]{ - writer: parquet.NewGenericWriter[T](output, options...), - metadata: metadata, - } -} - -func (w *GenericWriter[T]) Write(rows []T) (int, error) { - return w.writer.Write(rows) -} - -func (w *GenericWriter[T]) Close() error { - jsonMetadata, jsonErr := json.Marshal(w.metadata) - if jsonErr != nil { - return fmt.Errorf("trouble encoding metadata as json: %w", jsonErr) - } - - w.writer.SetKeyValueMetadata(MetadataKey, string(jsonMetadata)) - return w.writer.Close() -} - -var stringType = parquet.String().Type() - -func Geometry(value any, name string, metadata *Metadata, schema *parquet.Schema) (orb.Geometry, string, error) { - geometryString, ok := value.(string) - if !ok { - return nil, "", fmt.Errorf("unexpected geometry type: %t", value) - } - - encoding := metadata.Columns[name].Encoding - if encoding == "" { - column, ok := schema.Lookup(name) - if !ok { - return nil, "", fmt.Errorf("missing column: %s", name) - } - nodeType := column.Node.Type() - if nodeType == stringType { - encoding = EncodingWKT - } else if nodeType == parquet.ByteArrayType { - encoding = EncodingWKB - } else { - return nil, "", fmt.Errorf("unsupported geometry type: %s", nodeType) - } 
- } - - var geometry orb.Geometry - - switch strings.ToUpper(encoding) { - case EncodingWKB: - g, err := wkb.Unmarshal([]byte(geometryString)) - if err != nil { - return nil, "", fmt.Errorf("trouble reading geometry as WKB: %w", err) - } - geometry = g - case EncodingWKT: - g, err := wkt.Unmarshal(geometryString) - if err != nil { - return nil, "", fmt.Errorf("trouble reading geometry as WKT: %w", err) - } - geometry = g - default: - return nil, "", fmt.Errorf("unsupported encoding: %s", encoding) - } - - return geometry, strings.ToUpper(encoding), nil -} - -func GetCodec(codec string) (compress.Codec, error) { - switch codec { - case "uncompressed": - return &parquet.Uncompressed, nil - case "snappy": - return &parquet.Snappy, nil - case "gzip": - return &parquet.Gzip, nil - case "brotli": - return &parquet.Brotli, nil - case "zstd": - return &parquet.Zstd, nil - case "lz4raw": - return &parquet.Lz4Raw, nil - default: - return nil, fmt.Errorf("invalid compression codec %s", codec) - } -} - type ConvertOptions struct { InputPrimaryColumn string Compression string } -func FromParquet(file *parquet.File, output io.Writer, convertOptions *ConvertOptions) error { +func FromParquet(input parquet.ReaderAtSeeker, output io.Writer, convertOptions *ConvertOptions) error { if convertOptions == nil { convertOptions = &ConvertOptions{} } - reader := NewRowReader(file) - schema := file.Schema() - - codec := schema.Compression() - if convertOptions.Compression != "" { - candidate, codecErr := GetCodec(convertOptions.Compression) - if codecErr != nil { - return codecErr - } - codec = candidate + primaryColumn := DefaultGeometryColumn + if convertOptions.InputPrimaryColumn != "" { + primaryColumn = convertOptions.InputPrimaryColumn } - options := []parquet.WriterOption{ - parquet.Compression(codec), - schema, + metadata := &Metadata{ + PrimaryColumn: primaryColumn, + Columns: map[string]*GeometryColumn{ + primaryColumn: getDefaultGeometryColumn(), + }, } - writerConfig, configErr := parquet.NewWriterConfig(options...) 
- if configErr != nil { - return configErr + var compression *compress.Compression + if convertOptions.Compression != "" { + c, err := pqutil.GetCompression(convertOptions.Compression) + if err != nil { + return err + } + compression = &c } - writer := parquet.NewGenericWriter[any](output, writerConfig) - - boundsLookup := map[string]*orb.Bound{} - geometryTypeLookup := map[string]map[string]bool{} - - inputMetadata, metadataErr := GetMetadata(file) - if metadataErr != nil { - primaryColumn := DefaultGeometryColumn - if convertOptions.InputPrimaryColumn != "" { - primaryColumn = convertOptions.InputPrimaryColumn - } - inputMetadata = &Metadata{ - PrimaryColumn: primaryColumn, - Columns: map[string]*GeometryColumn{ - primaryColumn: {}, - }, - } + parquetSchema, schemaErr := pqutil.GetParquetSchema(input) + if schemaErr != nil { + return fmt.Errorf("trouble getting parquet schema: %w", schemaErr) } - outputMetadata := inputMetadata.Clone() - for { - row, err := reader.Next() - if err == io.EOF { - break + datasetInfo := geo.NewDatasetInfo(true) + for fieldNum := 0; fieldNum < parquetSchema.Root().NumFields(); fieldNum += 1 { + field := parquetSchema.Root().Field(fieldNum) + name := field.Name() + if _, ok := metadata.Columns[name]; !ok { + continue } - if err != nil { - return err + if field.LogicalType() == pqutil.ParquetStringType { + datasetInfo.AddCollection(name) } + } - properties := map[string]any{} - if err := schema.Reconstruct(&properties, row); err != nil { - return err - } + var transformSchema pqutil.SchemaTransformer + var transformColumn pqutil.ColumnTransformer + if datasetInfo.NumCollections() > 0 { + transformSchema = func(inputSchema *schema.Schema) (*schema.Schema, error) { + inputRoot := inputSchema.Root() + numFields := inputRoot.NumFields() - for name, inputColumn := range inputMetadata.Columns { - value, ok := properties[name] - if !ok { - return fmt.Errorf("missing geometry column: %s", name) - } - geometry, encoding, err := Geometry(value, name, inputMetadata, schema) - if err != nil { - return err - } - - if encoding != EncodingWKB { - column, ok := schema.Lookup(name) - if !ok { - return fmt.Errorf("missing geometry column: %s", name) + fields := make([]schema.Node, numFields) + for fieldNum := 0; fieldNum < numFields; fieldNum += 1 { + inputField := inputRoot.Field(fieldNum) + if !datasetInfo.HasCollection(inputField.Name()) { + fields[fieldNum] = inputField + continue } - geomBytes, wkbErr := wkb.Marshal(geometry) - if wkbErr != nil { - return fmt.Errorf("failed to encode %q geometry as wkb: %w", name, wkbErr) + outputField, err := schema.NewPrimitiveNode(inputField.Name(), inputField.RepetitionType(), parquet.Types.ByteArray, -1, -1) + if err != nil { + return nil, err } - row[column.ColumnIndex] = parquet.ValueOf(geomBytes) + fields[fieldNum] = outputField } - if inputColumn.Encoding != EncodingWKB { - outputMetadata.Columns[name].Encoding = EncodingWKB + outputRoot, err := schema.NewGroupNode(inputRoot.Name(), inputRoot.RepetitionType(), fields, -1) + if err != nil { + return nil, err } + return schema.NewSchema(outputRoot), nil + } - bounds := geometry.Bound() - if boundsLookup[name] != nil { - bounds = bounds.Union(*boundsLookup[name]) + transformColumn = func(inputField *arrow.Field, outputField *arrow.Field, chunked *arrow.Chunked) (*arrow.Chunked, error) { + if !datasetInfo.HasCollection(inputField.Name) { + return chunked, nil } - boundsLookup[name] = &bounds - - if geometryTypeLookup[name] == nil { - geometryTypeLookup[name] = map[string]bool{} + 
chunks := chunked.Chunks() + transformed := make([]arrow.Array, len(chunks)) + builder := array.NewBinaryBuilder(memory.DefaultAllocator, arrow.BinaryTypes.Binary) + defer builder.Release() + + collectionInfo := geo.NewCollectionInfo(false) + for i, arr := range chunks { + stringArray, ok := arr.(*array.String) + if !ok { + return nil, fmt.Errorf("expected a string array for %q, got %v", inputField.Name, arr) + } + for rowNum := 0; rowNum < stringArray.Len(); rowNum += 1 { + if outputField.Nullable && stringArray.IsNull(rowNum) { + builder.AppendNull() + continue + } + str := stringArray.Value(rowNum) + geometry, wktErr := wkt.Unmarshal(str) + if wktErr != nil { + return nil, wktErr + } + value, wkbErr := wkb.Marshal(geometry) + if wkbErr != nil { + return nil, wkbErr + } + collectionInfo.AddType(geometry.GeoJSONType()) + bounds := geometry.Bound() + collectionInfo.AddBounds(&bounds) + builder.Append(value) + } + transformed[i] = builder.NewArray() } - geometryTypeLookup[name][geometry.GeoJSONType()] = true - } - - _, writeErr := writer.WriteRows([]parquet.Row{row}) - if writeErr != nil { - return writeErr + datasetInfo.AddBounds(inputField.Name, collectionInfo.Bounds()) + datasetInfo.AddTypes(inputField.Name, collectionInfo.Types()) + chunked.Release() + return arrow.NewChunked(builder.Type(), transformed), nil } } - for name, bounds := range boundsLookup { - if bounds != nil { - if inputMetadata.Columns[name] == nil { - outputMetadata.Columns[name] = getDefaultGeometryColumn() + beforeClose := func(fileWriter *file.Writer) error { + for name, geometryCol := range metadata.Columns { + if !datasetInfo.HasCollection(name) { + continue } - outputMetadata.Columns[name].Bounds = []float64{ + bounds := datasetInfo.Bounds(name) + geometryCol.Bounds = []float64{ bounds.Left(), bounds.Bottom(), bounds.Right(), bounds.Top(), } + geometryCol.GeometryTypes = datasetInfo.Types(name) } - } - - for name, types := range geometryTypeLookup { - geometryTypes := []string{} - if len(types) > 0 { - for geometryType := range types { - geometryTypes = append(geometryTypes, geometryType) - } + encodedMetadata, jsonErr := json.Marshal(metadata) + if jsonErr != nil { + return fmt.Errorf("trouble encoding %q metadata: %w", MetadataKey, jsonErr) } - if inputMetadata.Columns[name] == nil { - outputMetadata.Columns[name] = getDefaultGeometryColumn() + if err := fileWriter.AppendKeyValueMetadata(MetadataKey, string(encodedMetadata)); err != nil { + return fmt.Errorf("trouble appending %q metadata: %w", MetadataKey, err) } - outputMetadata.Columns[name].GeometryTypes = geometryTypes + return nil } - metadataBytes, jsonErr := json.Marshal(outputMetadata) - if jsonErr != nil { - return fmt.Errorf("failed to serialize geo metadata: %w", jsonErr) + config := &pqutil.TransformConfig{ + Reader: input, + Writer: output, + TransformSchema: transformSchema, + TransformColumn: transformColumn, + BeforeClose: beforeClose, + Compression: compression, } - writer.SetKeyValueMetadata(MetadataKey, string(metadataBytes)) - return writer.Close() + + return pqutil.TransformByColumn(config) } diff --git a/internal/geoparquet/geoparquet_test.go b/internal/geoparquet/geoparquet_test.go index 2469a42..2df7d63 100644 --- a/internal/geoparquet/geoparquet_test.go +++ b/internal/geoparquet/geoparquet_test.go @@ -16,35 +16,38 @@ package geoparquet_test import ( "bytes" - "encoding/json" - "fmt" + "context" "io" "os" "testing" + "github.com/apache/arrow/go/v14/arrow/memory" + "github.com/apache/arrow/go/v14/parquet/file" + 
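A short sketch of calling the rewritten FromParquet directly may help before the test updates below. The file paths and options are assumptions for illustration; the input can be any parquet file whose geometry column holds WKT strings or WKB bytes.

// Sketch: convert a plain parquet file into GeoParquet.
package main

import (
	"log"
	"os"

	"github.com/planetlabs/gpq/internal/geoparquet"
)

func main() {
	input, openErr := os.Open("plain.parquet") // hypothetical input
	if openErr != nil {
		log.Fatal(openErr)
	}
	defer input.Close()

	output, createErr := os.Create("geo.parquet") // hypothetical output
	if createErr != nil {
		log.Fatal(createErr)
	}
	defer output.Close()

	// The primary geometry column defaults to "geometry"; both the column
	// name and the output compression can be overridden here.
	options := &geoparquet.ConvertOptions{
		InputPrimaryColumn: "geo",
		Compression:        "zstd",
	}
	if err := geoparquet.FromParquet(input, output, options); err != nil {
		log.Fatal(err)
	}
}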
"github.com/apache/arrow/go/v14/parquet/pqarrow" "github.com/paulmach/orb" "github.com/paulmach/orb/encoding/wkb" - "github.com/paulmach/orb/encoding/wkt" + "github.com/planetlabs/gpq/internal/geo" "github.com/planetlabs/gpq/internal/geoparquet" - "github.com/segmentio/parquet-go" + "github.com/planetlabs/gpq/internal/test" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -func TestGetMetadataV040(t *testing.T) { - fixturePath := "../testdata/cases/example-v0.4.0.parquet" - info, statErr := os.Stat(fixturePath) - require.NoError(t, statErr) - - input, openErr := os.Open(fixturePath) - require.NoError(t, openErr) - - file, fileErr := parquet.OpenFile(input, info.Size()) - require.NoError(t, fileErr) +func newFileReader(filepath string) (*file.Reader, error) { + f, fileErr := os.Open(filepath) + if fileErr != nil { + return nil, fileErr + } + return file.NewParquetReader(f) +} - metadata, geoErr := geoparquet.GetMetadata(file) - require.NoError(t, geoErr) +func TestGetMetadataV040(t *testing.T) { + reader, readerErr := newFileReader("../testdata/cases/example-v0.4.0.parquet") + require.NoError(t, readerErr) + defer reader.Close() + metadata, metadataErr := geoparquet.GetMetadata(reader.MetaData().GetKeyValueMetadata()) + require.NoError(t, metadataErr) assert.Equal(t, "geometry", metadata.PrimaryColumn) assert.Equal(t, "0.4.0", metadata.Version) require.Len(t, metadata.Columns, 1) @@ -57,19 +60,12 @@ func TestGetMetadataV040(t *testing.T) { } func TestGetMetadataV100Beta1(t *testing.T) { - fixturePath := "../testdata/cases/example-v1.0.0-beta.1.parquet" - info, statErr := os.Stat(fixturePath) - require.NoError(t, statErr) - - input, openErr := os.Open(fixturePath) - require.NoError(t, openErr) - - file, fileErr := parquet.OpenFile(input, info.Size()) - require.NoError(t, fileErr) - - metadata, geoErr := geoparquet.GetMetadata(file) - require.NoError(t, geoErr) + reader, readerErr := newFileReader("../testdata/cases/example-v1.0.0-beta.1.parquet") + require.NoError(t, readerErr) + defer reader.Close() + metadata, metadataErr := geoparquet.GetMetadata(reader.MetaData().GetKeyValueMetadata()) + require.NoError(t, metadataErr) assert.Equal(t, "geometry", metadata.PrimaryColumn) assert.Equal(t, "1.0.0-beta.1", metadata.Version) require.Len(t, metadata.Columns, 1) @@ -88,18 +84,12 @@ func TestGetMetadataV100Beta1(t *testing.T) { } func TestGetMetadataV1(t *testing.T) { - fixturePath := "../testdata/cases/example-v1.0.0.parquet" - info, statErr := os.Stat(fixturePath) - require.NoError(t, statErr) - - input, openErr := os.Open(fixturePath) - require.NoError(t, openErr) - - file, fileErr := parquet.OpenFile(input, info.Size()) - require.NoError(t, fileErr) + reader, readerErr := newFileReader("../testdata/cases/example-v1.0.0.parquet") + require.NoError(t, readerErr) + defer reader.Close() - metadata, geoErr := geoparquet.GetMetadata(file) - require.NoError(t, geoErr) + metadata, metadataErr := geoparquet.GetMetadata(reader.MetaData().GetKeyValueMetadata()) + require.NoError(t, metadataErr) assert.Equal(t, "geometry", metadata.PrimaryColumn) assert.Equal(t, "1.0.0", metadata.Version) @@ -113,155 +103,93 @@ func TestGetMetadataV1(t *testing.T) { assert.Contains(t, geomTypes, "MultiPolygon") } -func TestRowReaderV040(t *testing.T) { +func TestRecordReaderV040(t *testing.T) { fixturePath := "../testdata/cases/example-v0.4.0.parquet" - info, statErr := os.Stat(fixturePath) - require.NoError(t, statErr) - input, openErr := os.Open(fixturePath) require.NoError(t, openErr) - 
file, fileErr := parquet.OpenFile(input, info.Size()) - require.NoError(t, fileErr) + reader, err := geoparquet.NewRecordReader(&geoparquet.ReaderConfig{ + Reader: input, + }) + require.NoError(t, err) - reader := geoparquet.NewRowReader(file) - rows := []parquet.Row{} + numRows := 0 for { - row, err := reader.Next() + record, err := reader.Read() if err == io.EOF { break } require.NoError(t, err) - require.NotNil(t, row) - rows = append(rows, row) + numRows += int(record.NumRows()) } - assert.Len(t, rows, int(file.NumRows())) - - schema := file.Schema() - firstRow := rows[0] - - continentCol, ok := schema.Lookup("continent") - require.True(t, ok) - continent := firstRow[continentCol.ColumnIndex] - assert.Equal(t, "Oceania", continent.String()) - - nameCol, ok := schema.Lookup("name") - require.True(t, ok) - name := firstRow[nameCol.ColumnIndex] - assert.Equal(t, "Fiji", name.String()) + assert.Equal(t, 5, numRows) } func TestRowReaderV100Beta1(t *testing.T) { fixturePath := "../testdata/cases/example-v1.0.0-beta.1.parquet" - info, statErr := os.Stat(fixturePath) - require.NoError(t, statErr) - input, openErr := os.Open(fixturePath) require.NoError(t, openErr) - file, fileErr := parquet.OpenFile(input, info.Size()) - require.NoError(t, fileErr) + reader, err := geoparquet.NewRecordReader(&geoparquet.ReaderConfig{ + Reader: input, + }) + require.NoError(t, err) - reader := geoparquet.NewRowReader(file) - rows := []parquet.Row{} + numRows := 0 for { - row, err := reader.Next() + record, err := reader.Read() if err == io.EOF { break } require.NoError(t, err) - require.NotNil(t, row) - rows = append(rows, row) + numRows += int(record.NumRows()) } - assert.Len(t, rows, int(file.NumRows())) - - schema := file.Schema() - firstRow := rows[0] - - continentCol, ok := schema.Lookup("continent") - require.True(t, ok) - continent := firstRow[continentCol.ColumnIndex] - assert.Equal(t, "Oceania", continent.String()) - - nameCol, ok := schema.Lookup("name") - require.True(t, ok) - name := firstRow[nameCol.ColumnIndex] - assert.Equal(t, "Fiji", name.String()) + assert.Equal(t, 5, numRows) } -func makeParquet[T any](rows []T, metadata *geoparquet.Metadata) (*parquet.File, error) { - data := &bytes.Buffer{} - - writer := parquet.NewGenericWriter[T](data) - - _, writeErr := writer.Write(rows) - if writeErr != nil { - return nil, fmt.Errorf("trouble writing rows: %w", writeErr) - } - - if metadata != nil { - jsonMetadata, jsonErr := json.Marshal(metadata) - if jsonErr != nil { - return nil, fmt.Errorf("trouble encoding metadata as json: %w", jsonErr) - } - - writer.SetKeyValueMetadata(geoparquet.MetadataKey, string(jsonMetadata)) - } - - closeErr := writer.Close() - if closeErr != nil { - return nil, fmt.Errorf("trouble closing writer: %w", closeErr) - } - - return parquet.OpenFile(bytes.NewReader(data.Bytes()), int64(data.Len())) +func toWKB(t *testing.T, geometry orb.Geometry) []byte { + data, err := wkb.Marshal(geometry) + require.NoError(t, err) + return data } func TestFromParquetWithoutMetadata(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geometry []byte `parquet:"geometry"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry []byte `parquet:"name=geometry" json:"geometry"` } - point, pointErr := wkb.Marshal(orb.Point{1, 2}) - require.NoError(t, pointErr) - rows := []*Row{ { Name: "test-point", - Geometry: point, + Geometry: toWKB(t, orb.Point{1, 2}), }, } - parquetFile, inputErr := makeParquet(rows, nil) - require.NoError(t, inputErr) + input := 
test.ParquetFromStructs(t, rows) output := &bytes.Buffer{} - convertErr := geoparquet.FromParquet(parquetFile, output, nil) + convertErr := geoparquet.FromParquet(input, output, nil) require.NoError(t, convertErr) geoparquetInput := bytes.NewReader(output.Bytes()) - geoparquetFile, outputErr := parquet.OpenFile(geoparquetInput, geoparquetInput.Size()) - require.NoError(t, outputErr) - metadata, geoErr := geoparquet.GetMetadata(geoparquetFile) - require.NoError(t, geoErr) + reader, err := file.NewParquetReader(geoparquetInput) + require.NoError(t, err) + defer reader.Close() + + metadata, err := geoparquet.GetMetadata(reader.MetaData().KeyValueMetadata()) + require.NoError(t, err) assert.Len(t, metadata.Columns, 1) primaryColumnMetadata := metadata.Columns[metadata.PrimaryColumn] - geometryTypes := primaryColumnMetadata.GetGeometryTypes() - assert.Len(t, geometryTypes, 1) - assert.Contains(t, geometryTypes, "Point") - - bounds := primaryColumnMetadata.Bounds - assert.Equal(t, []float64{1, 2, 1, 2}, bounds) - - assert.Equal(t, geoparquet.EncodingWKB, primaryColumnMetadata.Encoding) + assert.Equal(t, geo.EncodingWKB, primaryColumnMetadata.Encoding) - assert.Equal(t, int64(1), geoparquetFile.NumRows()) + assert.Equal(t, int64(1), reader.NumRows()) } func TestMetadataClone(t *testing.T) { @@ -287,34 +215,34 @@ func TestMetadataClone(t *testing.T) { func TestFromParquetWithWKT(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geometry string `parquet:"geometry"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry string `parquet:"name=geometry, logical=String" json:"geometry"` } rows := []*Row{ { Name: "test-point-1", - Geometry: string(wkt.Marshal(orb.Point{1, 2})), + Geometry: "POINT (1 2)", }, { Name: "test-point-2", - Geometry: string(wkt.Marshal(orb.Point{3, 4})), + Geometry: "POINT (3 4)", }, } - parquetFile, inputErr := makeParquet(rows, nil) - require.NoError(t, inputErr) + input := test.ParquetFromStructs(t, rows) output := &bytes.Buffer{} - convertErr := geoparquet.FromParquet(parquetFile, output, nil) + convertErr := geoparquet.FromParquet(input, output, nil) require.NoError(t, convertErr) geoparquetInput := bytes.NewReader(output.Bytes()) - geoparquetFile, outputErr := parquet.OpenFile(geoparquetInput, geoparquetInput.Size()) - require.NoError(t, outputErr) + reader, err := file.NewParquetReader(geoparquetInput) + require.NoError(t, err) + defer reader.Close() - metadata, geoErr := geoparquet.GetMetadata(geoparquetFile) - require.NoError(t, geoErr) + metadata, err := geoparquet.GetMetadata(reader.MetaData().KeyValueMetadata()) + require.NoError(t, err) assert.Len(t, metadata.Columns, 1) @@ -327,41 +255,82 @@ func TestFromParquetWithWKT(t *testing.T) { bounds := primaryColumnMetadata.Bounds assert.Equal(t, []float64{1, 2, 3, 4}, bounds) - assert.Equal(t, geoparquet.EncodingWKB, primaryColumnMetadata.Encoding) + assert.Equal(t, geo.EncodingWKB, primaryColumnMetadata.Encoding) - assert.Equal(t, int64(2), geoparquetFile.NumRows()) + assert.Equal(t, int64(2), reader.NumRows()) } func TestFromParquetWithAltPrimaryColumn(t *testing.T) { type Row struct { - Name string `parquet:"name"` - Geo string `parquet:"geo"` + Name string `parquet:"name=name, logical=String" json:"name"` + Geo []byte `parquet:"name=geo" json:"geo"` } rows := []*Row{ { Name: "test-point-1", - Geo: string(wkt.Marshal(orb.Point{1, 2})), + Geo: toWKB(t, orb.Point{1, 2}), }, { Name: "test-point-2", - Geo: string(wkt.Marshal(orb.Point{3, 4})), + Geo: toWKB(t, orb.Point{3, 4}), }, } - 
parquetFile, inputErr := makeParquet(rows, nil) - require.NoError(t, inputErr) + input := test.ParquetFromStructs(t, rows) + + primaryColumn := "geo" output := &bytes.Buffer{} - convertErr := geoparquet.FromParquet(parquetFile, output, &geoparquet.ConvertOptions{InputPrimaryColumn: "geo"}) + convertErr := geoparquet.FromParquet(input, output, &geoparquet.ConvertOptions{InputPrimaryColumn: primaryColumn}) require.NoError(t, convertErr) geoparquetInput := bytes.NewReader(output.Bytes()) - geoparquetFile, outputErr := parquet.OpenFile(geoparquetInput, geoparquetInput.Size()) - require.NoError(t, outputErr) + reader, err := file.NewParquetReader(geoparquetInput) + require.NoError(t, err) + defer reader.Close() - metadata, geoErr := geoparquet.GetMetadata(geoparquetFile) - require.NoError(t, geoErr) + metadata, err := geoparquet.GetMetadata(reader.MetaData().KeyValueMetadata()) + require.NoError(t, err) + + assert.Equal(t, primaryColumn, metadata.PrimaryColumn) + assert.Len(t, metadata.Columns, 1) + primaryColumnMetadata := metadata.Columns[metadata.PrimaryColumn] + assert.Equal(t, geo.EncodingWKB, primaryColumnMetadata.Encoding) + + assert.Equal(t, int64(2), reader.NumRows()) +} + +func TestFromParquetWithAltPrimaryColumnWKT(t *testing.T) { + type Row struct { + Name string `parquet:"name=name, logical=String" json:"name"` + Geo string `parquet:"name=geo, logical=String" json:"geo"` + } + + rows := []*Row{ + { + Name: "test-point-1", + Geo: "POINT (1 2)", + }, + { + Name: "test-point-2", + Geo: "POINT (3 4)", + }, + } + + input := test.ParquetFromStructs(t, rows) + + output := &bytes.Buffer{} + convertErr := geoparquet.FromParquet(input, output, &geoparquet.ConvertOptions{InputPrimaryColumn: "geo"}) + require.NoError(t, convertErr) + + geoparquetInput := bytes.NewReader(output.Bytes()) + reader, err := file.NewParquetReader(geoparquetInput) + require.NoError(t, err) + defer reader.Close() + + metadata, err := geoparquet.GetMetadata(reader.MetaData().KeyValueMetadata()) + require.NoError(t, err) assert.Len(t, metadata.Columns, 1) @@ -374,7 +343,34 @@ func TestFromParquetWithAltPrimaryColumn(t *testing.T) { bounds := primaryColumnMetadata.Bounds assert.Equal(t, []float64{1, 2, 3, 4}, bounds) - assert.Equal(t, geoparquet.EncodingWKB, primaryColumnMetadata.Encoding) + assert.Equal(t, geo.EncodingWKB, primaryColumnMetadata.Encoding) + + assert.Equal(t, int64(2), reader.NumRows()) +} + +func TestRecordReading(t *testing.T) { + f, fileErr := os.Open("../testdata/cases/example-v1.0.0-beta.1.parquet") + require.NoError(t, fileErr) + reader, readerErr := file.NewParquetReader(f) + require.NoError(t, readerErr) + defer reader.Close() + + pqReader, pqErr := pqarrow.NewFileReader(reader, pqarrow.ArrowReadProperties{BatchSize: 10}, memory.DefaultAllocator) + require.NoError(t, pqErr) + + recordReader, rrErr := pqReader.GetRecordReader(context.Background(), nil, nil) + require.NoError(t, rrErr) + + numRows := 0 + for { + rec, err := recordReader.Read() + if err == io.EOF { + assert.Nil(t, rec) + break + } + assert.NoError(t, err) + numRows += int(rec.NumRows()) + } - assert.Equal(t, int64(2), geoparquetFile.NumRows()) + assert.Equal(t, reader.NumRows(), int64(numRows)) } diff --git a/internal/geoparquet/metadata.go b/internal/geoparquet/metadata.go new file mode 100644 index 0000000..e16de1c --- /dev/null +++ b/internal/geoparquet/metadata.go @@ -0,0 +1,182 @@ +package geoparquet + +import ( + "encoding/json" + "fmt" + + "github.com/apache/arrow/go/v14/parquet/metadata" + 
"github.com/planetlabs/gpq/internal/geo" +) + +const ( + Version = "1.0.0" + MetadataKey = "geo" + EdgesPlanar = "planar" + EdgesSpherical = "spherical" + OrientationCounterClockwise = "counterclockwise" + DefaultGeometryColumn = "geometry" + DefaultGeometryEncoding = geo.EncodingWKB +) + +var GeometryTypes = []string{ + "Point", + "LineString", + "Polygon", + "MultiPoint", + "MultiLineString", + "MultiPolygon", + "GeometryCollection", + "Point Z", + "LineString Z", + "Polygon Z", + "MultiPoint Z", + "MultiLineString Z", + "MultiPolygon Z", + "GeometryCollection Z", +} + +type Metadata struct { + Version string `json:"version"` + PrimaryColumn string `json:"primary_column"` + Columns map[string]*GeometryColumn `json:"columns"` +} + +func (m *Metadata) Clone() *Metadata { + clone := &Metadata{} + *clone = *m + clone.Columns = make(map[string]*GeometryColumn, len(m.Columns)) + for i, v := range m.Columns { + clone.Columns[i] = v.clone() + } + return clone +} + +type ProjId struct { + Authority string `json:"authority"` + Code any `json:"code"` +} + +type Proj struct { + Name string `json:"name"` + Id *ProjId `json:"id"` +} + +func (p *Proj) String() string { + id := "" + if p.Id != nil { + if code, ok := p.Id.Code.(string); ok { + id = p.Id.Authority + ":" + code + } else if code, ok := p.Id.Code.(float64); ok { + id = fmt.Sprintf("%s:%g", p.Id.Authority, code) + } + } + if p.Name != "" { + return p.Name + } + if id == "" { + return "Unknown" + } + return id +} + +type GeometryColumn struct { + Encoding string `json:"encoding"` + GeometryType any `json:"geometry_type,omitempty"` + GeometryTypes any `json:"geometry_types"` + CRS *Proj `json:"crs,omitempty"` + Edges string `json:"edges,omitempty"` + Orientation string `json:"orientation,omitempty"` + Bounds []float64 `json:"bbox,omitempty"` + Epoch float64 `json:"epoch,omitempty"` +} + +func (g *GeometryColumn) clone() *GeometryColumn { + clone := &GeometryColumn{} + *clone = *g + clone.Bounds = make([]float64, len(g.Bounds)) + copy(clone.Bounds, g.Bounds) + return clone +} + +func (col *GeometryColumn) GetGeometryTypes() []string { + if multiType, ok := col.GeometryTypes.([]any); ok { + types := make([]string, len(multiType)) + for i, value := range multiType { + geometryType, ok := value.(string) + if !ok { + return nil + } + types[i] = geometryType + } + return types + } + + if singleType, ok := col.GeometryType.(string); ok { + return []string{singleType} + } + + values, ok := col.GeometryType.([]any) + if !ok { + return nil + } + + types := make([]string, len(values)) + for i, value := range values { + geometryType, ok := value.(string) + if !ok { + return nil + } + types[i] = geometryType + } + + return types +} + +func getDefaultGeometryColumn() *GeometryColumn { + return &GeometryColumn{ + Encoding: DefaultGeometryEncoding, + GeometryTypes: []string{}, + } +} + +func DefaultMetadata() *Metadata { + return &Metadata{ + Version: Version, + PrimaryColumn: DefaultGeometryColumn, + Columns: map[string]*GeometryColumn{ + DefaultGeometryColumn: getDefaultGeometryColumn(), + }, + } +} + +var ErrNoMetadata = fmt.Errorf("missing %s metadata key", MetadataKey) +var ErrDuplicateMetadata = fmt.Errorf("found more than one %s metadata key", MetadataKey) + +func GetMetadata(keyValueMetadata metadata.KeyValueMetadata) (*Metadata, error) { + value, err := GetMetadataValue(keyValueMetadata) + if err != nil { + return nil, err + } + geoFileMetadata := &Metadata{} + jsonErr := json.Unmarshal([]byte(value), geoFileMetadata) + if jsonErr != nil { + return 
nil, fmt.Errorf("unable to parse %s metadata: %w", MetadataKey, jsonErr) + } + return geoFileMetadata, nil +} + +func GetMetadataValue(keyValueMetadata metadata.KeyValueMetadata) (string, error) { + var value *string + for _, kv := range keyValueMetadata { + if kv.Key == MetadataKey { + if value != nil { + return "", ErrDuplicateMetadata + } + value = kv.Value + } + } + if value == nil { + return "", ErrNoMetadata + } + return *value, nil +} diff --git a/internal/geoparquet/recordreader.go b/internal/geoparquet/recordreader.go new file mode 100644 index 0000000..619aabd --- /dev/null +++ b/internal/geoparquet/recordreader.go @@ -0,0 +1,93 @@ +package geoparquet + +import ( + "context" + "errors" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/arrow/memory" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/file" + "github.com/apache/arrow/go/v14/parquet/pqarrow" + "github.com/apache/arrow/go/v14/parquet/schema" +) + +const ( + defaultReadBatchSize = 1024 +) + +type ReaderConfig struct { + BatchSize int + Reader parquet.ReaderAtSeeker + File *file.Reader + Context context.Context +} + +type RecordReader struct { + fileReader *file.Reader + metadata *Metadata + recordReader pqarrow.RecordReader +} + +func NewRecordReader(config *ReaderConfig) (*RecordReader, error) { + batchSize := config.BatchSize + if batchSize == 0 { + batchSize = defaultReadBatchSize + } + + ctx := config.Context + if ctx == nil { + ctx = context.Background() + } + + fileReader := config.File + if fileReader == nil { + if config.Reader == nil { + return nil, errors.New("config must include a File or Reader value") + } + fr, frErr := file.NewParquetReader(config.Reader) + if frErr != nil { + return nil, frErr + } + fileReader = fr + } + + geoMetadata, geoMetadataErr := GetMetadata(fileReader.MetaData().GetKeyValueMetadata()) + if geoMetadataErr != nil { + return nil, geoMetadataErr + } + + arrowReader, arrowErr := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{BatchSize: int64(batchSize)}, memory.DefaultAllocator) + if arrowErr != nil { + return nil, arrowErr + } + + recordReader, recordErr := arrowReader.GetRecordReader(ctx, nil, nil) + if recordErr != nil { + return nil, recordErr + } + + reader := &RecordReader{ + fileReader: fileReader, + metadata: geoMetadata, + recordReader: recordReader, + } + return reader, nil +} + +func (r *RecordReader) Read() (arrow.Record, error) { + return r.recordReader.Read() +} + +func (r *RecordReader) Metadata() *Metadata { + return r.metadata +} + +func (r *RecordReader) Schema() *schema.Schema { + return r.fileReader.MetaData().Schema +} + +func (r *RecordReader) Close() error { + r.recordReader.Release() + return r.fileReader.Close() +} diff --git a/internal/geoparquet/recordwriter.go b/internal/geoparquet/recordwriter.go new file mode 100644 index 0000000..7612712 --- /dev/null +++ b/internal/geoparquet/recordwriter.go @@ -0,0 +1,81 @@ +package geoparquet + +import ( + "encoding/json" + "errors" + "fmt" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/pqarrow" +) + +type RecordWriter struct { + fileWriter *pqarrow.FileWriter + metadata *Metadata + wroteGeoMetadata bool +} + +func NewRecordWriter(config *WriterConfig) (*RecordWriter, error) { + parquetProps := config.ParquetWriterProps + if parquetProps == nil { + parquetProps = parquet.NewWriterProperties() + } + + arrowProps := config.ArrowWriterProps + if arrowProps == nil { + 
defaults := pqarrow.DefaultWriterProps() + arrowProps = &defaults + } + + if config.ArrowSchema == nil { + return nil, errors.New("schema is required") + } + + if config.Writer == nil { + return nil, errors.New("writer is required") + } + fileWriter, fileErr := pqarrow.NewFileWriter(config.ArrowSchema, config.Writer, parquetProps, *arrowProps) + if fileErr != nil { + return nil, fileErr + } + + writer := &RecordWriter{ + fileWriter: fileWriter, + metadata: config.Metadata, + } + + return writer, nil +} + +func (w *RecordWriter) AppendKeyValueMetadata(key string, value string) error { + if err := w.fileWriter.AppendKeyValueMetadata(key, value); err != nil { + return err + } + if key == MetadataKey { + w.wroteGeoMetadata = true + } + return nil +} + +func (w *RecordWriter) Write(record arrow.Record) error { + return w.fileWriter.WriteBuffered(record) +} + +func (w *RecordWriter) Close() error { + if !w.wroteGeoMetadata { + metadata := w.metadata + if metadata == nil { + metadata = DefaultMetadata() + } + data, err := json.Marshal(metadata) + if err != nil { + return fmt.Errorf("failed to encode %s file metadata", MetadataKey) + } + if err := w.fileWriter.AppendKeyValueMetadata(MetadataKey, string(data)); err != nil { + return fmt.Errorf("failed to append %s file metadata", MetadataKey) + } + + } + return w.fileWriter.Close() +} diff --git a/internal/geoparquet/writer.go b/internal/geoparquet/writer.go new file mode 100644 index 0000000..4ca8b9c --- /dev/null +++ b/internal/geoparquet/writer.go @@ -0,0 +1,17 @@ +package geoparquet + +import ( + "io" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/pqarrow" +) + +type WriterConfig struct { + Writer io.Writer + Metadata *Metadata + ParquetWriterProps *parquet.WriterProperties + ArrowWriterProps *pqarrow.ArrowWriterProperties + ArrowSchema *arrow.Schema +} diff --git a/internal/pqutil/arrow.go b/internal/pqutil/arrow.go new file mode 100644 index 0000000..eec9450 --- /dev/null +++ b/internal/pqutil/arrow.go @@ -0,0 +1,188 @@ +package pqutil + +import ( + "errors" + "fmt" + "reflect" + "sort" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/planetlabs/gpq/internal/geo" +) + +type ArrowSchemaBuilder struct { + fields map[string]*arrow.Field +} + +func NewArrowSchemaBuilder() *ArrowSchemaBuilder { + return &ArrowSchemaBuilder{ + fields: map[string]*arrow.Field{}, + } +} + +func (b *ArrowSchemaBuilder) Has(name string) bool { + _, has := b.fields[name] + return has +} + +func (b *ArrowSchemaBuilder) AddGeometry(name string, encoding string) error { + var dataType arrow.DataType + switch encoding { + case geo.EncodingWKB: + dataType = arrow.BinaryTypes.Binary + case geo.EncodingWKT: + dataType = arrow.BinaryTypes.String + default: + return fmt.Errorf("unsupported geometry encoding: %s", encoding) + } + b.fields[name] = &arrow.Field{Name: name, Type: dataType, Nullable: true} + return nil +} + +func (b *ArrowSchemaBuilder) Add(record map[string]any) error { + for name, value := range record { + if b.fields[name] != nil { + continue + } + if value == nil { + b.fields[name] = nil + continue + } + if values, ok := value.([]any); ok { + if len(values) == 0 { + b.fields[name] = nil + continue + + } + } + field, err := fieldFromValue(name, value, true) + if err != nil { + return fmt.Errorf("error converting value for %s: %w", name, err) + } + b.fields[name] = field + } + return nil +} + +func fieldFromValue(name string, value any, nullable bool) (*arrow.Field, error) { + 
switch v := value.(type) { + case bool: + return &arrow.Field{Name: name, Type: arrow.FixedWidthTypes.Boolean, Nullable: nullable}, nil + case int, int64: + return &arrow.Field{Name: name, Type: arrow.PrimitiveTypes.Int64, Nullable: nullable}, nil + case int32: + return &arrow.Field{Name: name, Type: arrow.PrimitiveTypes.Int32, Nullable: nullable}, nil + case float32: + return &arrow.Field{Name: name, Type: arrow.PrimitiveTypes.Float32, Nullable: nullable}, nil + case float64: + return &arrow.Field{Name: name, Type: arrow.PrimitiveTypes.Float64, Nullable: nullable}, nil + case []byte: + return &arrow.Field{Name: name, Type: arrow.BinaryTypes.Binary, Nullable: nullable}, nil + case string: + return &arrow.Field{Name: name, Type: arrow.BinaryTypes.String, Nullable: nullable}, nil + case []any: + if len(v) == 0 { + return nil, nil + } + if err := assertUniformType(v); err != nil { + return nil, err + } + field, err := fieldFromValue(name, v[0], nullable) + if err != nil { + return nil, err + } + return &arrow.Field{Name: name, Type: arrow.ListOf(field.Type), Nullable: nullable}, nil + case map[string]any: + if len(v) == 0 { + return nil, nil + } + return fieldFromMap(name, v, nullable) + default: + return nil, fmt.Errorf("cannot convert value: %v", v) + } +} + +func fieldFromMap(name string, value map[string]any, nullable bool) (*arrow.Field, error) { + keys := sortedKeys(value) + length := len(keys) + fields := make([]arrow.Field, length) + for i, key := range keys { + field, err := fieldFromValue(key, value[key], nullable) + if err != nil { + return nil, fmt.Errorf("trouble generating schema for field %q: %w", key, err) + } + if field == nil { + return nil, nil + } + fields[i] = *field + } + return &arrow.Field{Name: name, Type: arrow.StructOf(fields...), Nullable: nullable}, nil +} + +func assertUniformType(values []any) error { + length := len(values) + if length == 0 { + return errors.New("cannot determine type from zero length slice") + } + mixedTypeErr := errors.New("slices must be of all the same type") + switch v := values[0].(type) { + case bool: + for i := 1; i < length; i += 1 { + if _, ok := values[i].(bool); !ok { + return mixedTypeErr + } + } + case float64: + for i := 1; i < length; i += 1 { + if _, ok := values[i].(float64); !ok { + return mixedTypeErr + } + } + case string: + for i := 1; i < length; i += 1 { + if _, ok := values[i].(string); !ok { + return mixedTypeErr + } + } + default: + t := reflect.TypeOf(v) + for i := 1; i < length; i += 1 { + if reflect.TypeOf(values[i]) != t { + return mixedTypeErr + } + } + } + return nil +} + +func (b *ArrowSchemaBuilder) Ready() bool { + for _, field := range b.fields { + if field == nil { + return false + } + } + return true +} + +func (b *ArrowSchemaBuilder) Schema() (*arrow.Schema, error) { + fields := make([]arrow.Field, len(b.fields)) + for i, name := range sortedKeys(b.fields) { + field := b.fields[name] + if field == nil { + return nil, fmt.Errorf("could not derive type for field: %s", name) + } + fields[i] = *field + } + return arrow.NewSchema(fields, nil), nil +} + +func sortedKeys[V any](m map[string]V) []string { + keys := make([]string, len(m)) + i := 0 + for k := range m { + keys[i] = k + i += 1 + } + sort.Strings(keys) + return keys +} diff --git a/internal/pqutil/arrow_test.go b/internal/pqutil/arrow_test.go new file mode 100644 index 0000000..fc379d0 --- /dev/null +++ b/internal/pqutil/arrow_test.go @@ -0,0 +1,140 @@ +package pqutil_test + +import ( + "fmt" + "testing" + + "github.com/planetlabs/gpq/internal/pqutil" 
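The ArrowSchemaBuilder above infers field types from example records, deferring any field whose value is nil or an empty slice until a later record supplies a concrete type. A minimal sketch, again assuming code inside this module:

// Sketch: build an Arrow schema from heterogeneous example records.
package main

import (
	"fmt"
	"log"

	"github.com/planetlabs/gpq/internal/geo"
	"github.com/planetlabs/gpq/internal/pqutil"
)

func main() {
	builder := pqutil.NewArrowSchemaBuilder()

	// The first record leaves "count" untyped because its value is nil.
	if err := builder.Add(map[string]any{"name": "fiji", "count": nil}); err != nil {
		log.Fatal(err)
	}
	fmt.Println(builder.Ready()) // false

	// A later record resolves "count" to an int64 field.
	if err := builder.Add(map[string]any{"name": "chile", "count": 42}); err != nil {
		log.Fatal(err)
	}
	fmt.Println(builder.Ready()) // true

	// Geometry columns are added explicitly with their encoding.
	if err := builder.AddGeometry("geometry", geo.EncodingWKB); err != nil {
		log.Fatal(err)
	}

	schema, err := builder.Schema()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(schema) // fields are sorted by name: count, geometry, name
}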
+ "github.com/planetlabs/gpq/internal/test" + "github.com/stretchr/testify/require" +) + +func TestBuilder(t *testing.T) { + cases := []struct { + name string + record map[string]any + schema string + }{ + { + name: "flat map", + record: map[string]any{ + "maybe": true, + "answer": 42, + "small": int32(32), + "pi": 4.13, + "data": []byte{'a', 'b', 'c'}, + "good": "yup", + }, + schema: ` + message { + optional int64 answer (INT (64, true)); + optional binary data; + optional binary good (STRING); + optional boolean maybe; + optional double pi; + optional int32 small (INT (32, true)); + } + `, + }, + { + name: "with slices", + record: map[string]any{ + "bools": []any{true, false, true}, + "strings": []any{"chicken", "noodle", "soup"}, + "floats": []any{1.23, 4.56, 7.89}, + "ints": []any{3, 2, 1}, + }, + schema: ` + message { + optional group bools (LIST) { + repeated group list { + optional boolean element; + } + } + optional group floats (LIST) { + repeated group list { + optional double element; + } + } + optional group ints (LIST) { + repeated group list { + optional int64 element (INT (64, true)); + } + } + optional group strings (LIST) { + repeated group list { + optional binary element (STRING); + } + } + } + `, + }, + { + name: "with maps", + record: map[string]any{ + "complex": map[string]any{ + "maybe": true, + "answer": 42, + "small": int32(32), + "pi": 4.13, + "data": []byte{'a', 'b', 'c'}, + "good": "yup", + }, + }, + schema: ` + message { + optional group complex { + optional int64 answer (INT (64, true)); + optional binary data; + optional binary good (STRING); + optional boolean maybe; + optional double pi; + optional int32 small (INT (32, true)); + } + } + `, + }, + { + name: "with slices of maps", + record: map[string]any{ + "things": []any{ + map[string]any{ + "what": "soup", + "cost": 1.00, + }, + map[string]any{ + "what": "car", + "cost": 40000.00, + }, + map[string]any{ + "what": "house", + "cost": 1000000.00, + }, + }, + }, + schema: ` + message { + optional group things (LIST) { + repeated group list { + optional group element { + optional double cost; + optional binary what (STRING); + } + } + } + } + `, + }, + } + + for i, c := range cases { + t.Run(fmt.Sprintf("%s (case %d)", c.name, i), func(t *testing.T) { + b := pqutil.NewArrowSchemaBuilder() + require.NoError(t, b.Add(c.record)) + s, err := b.Schema() + require.NoError(t, err) + require.NotNil(t, s) + test.AssertArrowSchemaMatches(t, c.schema, s) + }) + } +} diff --git a/internal/pqutil/compression.go b/internal/pqutil/compression.go new file mode 100644 index 0000000..c376d4c --- /dev/null +++ b/internal/pqutil/compression.go @@ -0,0 +1,26 @@ +package pqutil + +import ( + "fmt" + + "github.com/apache/arrow/go/v14/parquet/compress" +) + +func GetCompression(codec string) (compress.Compression, error) { + switch codec { + case "uncompressed": + return compress.Codecs.Uncompressed, nil + case "snappy": + return compress.Codecs.Snappy, nil + case "gzip": + return compress.Codecs.Gzip, nil + case "brotli": + return compress.Codecs.Brotli, nil + case "zstd": + return compress.Codecs.Zstd, nil + case "lz4": + return compress.Codecs.Lz4, nil + default: + return compress.Codecs.Uncompressed, fmt.Errorf("invalid compression codec %s", codec) + } +} diff --git a/internal/pqutil/parquet.go b/internal/pqutil/parquet.go new file mode 100644 index 0000000..94601f9 --- /dev/null +++ b/internal/pqutil/parquet.go @@ -0,0 +1,202 @@ +package pqutil + +import ( + "fmt" + "strings" + + "github.com/apache/arrow/go/v14/parquet" + 
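GetCompression maps the CLI codec names onto arrow's compress.Codecs values. Combined with the RecordReader and RecordWriter introduced earlier, it is enough for a read-and-rewrite pass over a GeoParquet file. The following is an illustrative sketch; the paths are assumptions.

// Sketch: copy a GeoParquet file, re-encoding it with zstd compression.
package main

import (
	"io"
	"log"
	"os"

	"github.com/apache/arrow/go/v14/parquet"
	"github.com/planetlabs/gpq/internal/geoparquet"
	"github.com/planetlabs/gpq/internal/pqutil"
)

func main() {
	input, openErr := os.Open("input.parquet") // hypothetical input
	if openErr != nil {
		log.Fatal(openErr)
	}
	defer input.Close()

	reader, readerErr := geoparquet.NewRecordReader(&geoparquet.ReaderConfig{Reader: input})
	if readerErr != nil {
		log.Fatal(readerErr)
	}
	defer reader.Close()

	codec, codecErr := pqutil.GetCompression("zstd")
	if codecErr != nil {
		log.Fatal(codecErr)
	}

	output, createErr := os.Create("output.parquet") // hypothetical output
	if createErr != nil {
		log.Fatal(createErr)
	}
	defer output.Close()

	var writer *geoparquet.RecordWriter
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		if writer == nil {
			// The Arrow schema comes from the first record; the reader's geo
			// metadata is passed through so Close re-appends it to the output.
			w, writerErr := geoparquet.NewRecordWriter(&geoparquet.WriterConfig{
				Writer:             output,
				ArrowSchema:        record.Schema(),
				Metadata:           reader.Metadata(),
				ParquetWriterProps: parquet.NewWriterProperties(parquet.WithCompression(codec)),
			})
			if writerErr != nil {
				log.Fatal(writerErr)
			}
			writer = w
		}
		if err := writer.Write(record); err != nil {
			log.Fatal(err)
		}
	}
	if writer != nil {
		if err := writer.Close(); err != nil {
			log.Fatal(err)
		}
	}
}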
"github.com/apache/arrow/go/v14/parquet/file" + pqschema "github.com/apache/arrow/go/v14/parquet/schema" +) + +var ParquetStringType = pqschema.StringLogicalType{} + +func GetParquetSchema(input parquet.ReaderAtSeeker) (*pqschema.Schema, error) { + fileReader, err := file.NewParquetReader(input) + if err != nil { + return nil, err + } + schema := fileReader.MetaData().Schema + if err := fileReader.Close(); err != nil { + return nil, err + } + return schema, nil +} + +func LookupNode(schema *pqschema.Schema, name string) (pqschema.Node, bool) { + root := schema.Root() + index := root.FieldIndexByName(name) + if index < 0 { + return nil, false + } + + return root.Field(index), true +} + +func LookupPrimitiveNode(schema *pqschema.Schema, name string) (*pqschema.PrimitiveNode, bool) { + node, ok := LookupNode(schema, name) + if !ok { + return nil, false + } + + primitive, ok := node.(*pqschema.PrimitiveNode) + return primitive, ok +} + +func LookupGroupNode(schema *pqschema.Schema, name string) (*pqschema.GroupNode, bool) { + node, ok := LookupNode(schema, name) + if !ok { + return nil, false + } + + group, ok := node.(*pqschema.GroupNode) + return group, ok +} + +func LookupListElementNode(sc *pqschema.Schema, name string) (*pqschema.PrimitiveNode, bool) { + node, ok := LookupGroupNode(sc, name) + if !ok { + return nil, false + } + + if node.NumFields() != 1 { + return nil, false + } + + group, ok := node.Field(0).(*pqschema.GroupNode) + if !ok { + return nil, false + } + + if group.NumFields() != 1 { + return nil, false + } + + element, ok := group.Field(0).(*pqschema.PrimitiveNode) + return element, ok +} + +// ParquetSchemaString generates a string representation of the schema as documented +// in https://pkg.go.dev/github.com/fraugster/parquet-go/parquetschema +func ParquetSchemaString(schema *pqschema.Schema) string { + w := &parquetWriter{} + return w.String(schema) +} + +type parquetWriter struct { + builder *strings.Builder + err error +} + +func (w *parquetWriter) String(schema *pqschema.Schema) string { + w.builder = &strings.Builder{} + w.err = nil + w.writeSchema(schema) + if w.err != nil { + return w.err.Error() + } + return w.builder.String() +} + +func (w *parquetWriter) writeLine(str string, level int) { + if w.err != nil { + return + } + indent := strings.Repeat(" ", level) + if _, err := w.builder.WriteString(indent + str + "\n"); err != nil { + w.err = err + } +} + +func (w *parquetWriter) writeSchema(schema *pqschema.Schema) { + w.writeLine("message {", 0) + root := schema.Root() + for i := 0; i < root.NumFields(); i += 1 { + w.writeNode(root.Field(i), 1) + } + w.writeLine("}", 0) +} + +func (w *parquetWriter) writeNode(node pqschema.Node, level int) { + switch n := node.(type) { + case *pqschema.GroupNode: + w.writeGroupNode(n, level) + case *pqschema.PrimitiveNode: + w.writePrimitiveNode(n, level) + default: + w.writeLine(fmt.Sprintf("unknown node type: %v", node), level) + } +} + +func (w *parquetWriter) writeGroupNode(node *pqschema.GroupNode, level int) { + repetition := node.RepetitionType().String() + name := node.Name() + annotation := logicalOrConvertedAnnotation(node) + + w.writeLine(fmt.Sprintf("%s group %s%s {", repetition, name, annotation), level) + for i := 0; i < node.NumFields(); i += 1 { + w.writeNode(node.Field(i), level+1) + } + w.writeLine("}", level) +} + +func (w *parquetWriter) writePrimitiveNode(node *pqschema.PrimitiveNode, level int) { + repetition := node.RepetitionType().String() + name := node.Name() + nodeType := 
physicalTypeString(node.PhysicalType()) + annotation := logicalOrConvertedAnnotation(node) + + w.writeLine(fmt.Sprintf("%s %s %s%s;", repetition, nodeType, name, annotation), level) +} + +func logicalOrConvertedAnnotation(node pqschema.Node) string { + logicalType := node.LogicalType() + convertedType := node.ConvertedType() + + switch t := logicalType.(type) { + case *pqschema.IntLogicalType: + return fmt.Sprintf(" (INT (%d, %t))", t.BitWidth(), t.IsSigned()) + case *pqschema.DecimalLogicalType: + return fmt.Sprintf(" (DECIMAL (%d, %d))", t.Precision(), t.Scale()) + case *pqschema.TimestampLogicalType: + var unit string + switch t.TimeUnit() { + case pqschema.TimeUnitMillis: + unit = "MILLIS" + case pqschema.TimeUnitMicros: + unit = "MICROS" + case pqschema.TimeUnitNanos: + unit = "NANOS" + default: + unit = "UNKNOWN" + } + return fmt.Sprintf(" (TIMESTAMP (%s, %t))", unit, t.IsAdjustedToUTC()) + } + + var annotation string + _, invalid := logicalType.(pqschema.UnknownLogicalType) + _, none := logicalType.(pqschema.NoLogicalType) + + if logicalType != nil && !invalid && !none { + annotation = fmt.Sprintf(" (%s)", strings.ToUpper(logicalType.String())) + } else if convertedType != pqschema.ConvertedTypes.None { + annotation = fmt.Sprintf(" (%s)", strings.ToUpper(convertedType.String())) + } + + return annotation +} + +var physicalTypeLookup = map[string]string{ + "byte_array": "binary", +} + +func physicalTypeString(physical parquet.Type) string { + nodeType := strings.ToLower(physical.String()) + if altType, ok := physicalTypeLookup[nodeType]; ok { + return altType + } + if physical == parquet.Types.FixedLenByteArray { + nodeType += fmt.Sprintf(" (%d)", physical.ByteSize()) + } + return nodeType +} diff --git a/internal/pqutil/parquet_test.go b/internal/pqutil/parquet_test.go new file mode 100644 index 0000000..91d4945 --- /dev/null +++ b/internal/pqutil/parquet_test.go @@ -0,0 +1,132 @@ +package pqutil_test + +import ( + "fmt" + "testing" + + "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/parquet/pqarrow" + "github.com/planetlabs/gpq/internal/pqutil" + "github.com/planetlabs/gpq/internal/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestArrowToParquetString(t *testing.T) { + cases := []struct { + name string + schema *arrow.Schema + expected string + }{ + { + name: "basic", + schema: arrow.NewSchema([]arrow.Field{ + {Name: "optional_bytes", Type: arrow.BinaryTypes.Binary, Nullable: true}, + {Name: "optional_float32", Type: arrow.PrimitiveTypes.Float32, Nullable: true}, + {Name: "optional_float64", Type: arrow.PrimitiveTypes.Float64, Nullable: true}, + {Name: "optional_int32", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, + {Name: "optional_int64", Type: arrow.PrimitiveTypes.Int64, Nullable: true}, + {Name: "optional_bool", Type: arrow.FixedWidthTypes.Boolean, Nullable: true}, + {Name: "required_bool", Type: arrow.FixedWidthTypes.Boolean, Nullable: false}, + {Name: "optional_string", Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "required_fixed_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}, Nullable: false}, + }, nil), + expected: ` + message { + optional binary optional_bytes; + optional float optional_float32; + optional double optional_float64; + optional int32 optional_int32 (INT (32, true)); + optional int64 optional_int64 (INT (64, true)); + optional boolean optional_bool; + required boolean required_bool; + optional binary optional_string (STRING); + required fixed_len_byte_array 
diff --git a/internal/pqutil/parquet_test.go b/internal/pqutil/parquet_test.go
new file mode 100644
index 0000000..91d4945
--- /dev/null
+++ b/internal/pqutil/parquet_test.go
@@ -0,0 +1,132 @@
+package pqutil_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/apache/arrow/go/v14/arrow"
+	"github.com/apache/arrow/go/v14/parquet/pqarrow"
+	"github.com/planetlabs/gpq/internal/pqutil"
+	"github.com/planetlabs/gpq/internal/test"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestArrowToParquetString(t *testing.T) {
+	cases := []struct {
+		name     string
+		schema   *arrow.Schema
+		expected string
+	}{
+		{
+			name: "basic",
+			schema: arrow.NewSchema([]arrow.Field{
+				{Name: "optional_bytes", Type: arrow.BinaryTypes.Binary, Nullable: true},
+				{Name: "optional_float32", Type: arrow.PrimitiveTypes.Float32, Nullable: true},
+				{Name: "optional_float64", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
+				{Name: "optional_int32", Type: arrow.PrimitiveTypes.Int32, Nullable: true},
+				{Name: "optional_int64", Type: arrow.PrimitiveTypes.Int64, Nullable: true},
+				{Name: "optional_bool", Type: arrow.FixedWidthTypes.Boolean, Nullable: true},
+				{Name: "required_bool", Type: arrow.FixedWidthTypes.Boolean, Nullable: false},
+				{Name: "optional_string", Type: arrow.BinaryTypes.String, Nullable: true},
+				{Name: "required_fixed_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}, Nullable: false},
+			}, nil),
+			expected: `
+				message {
+					optional binary optional_bytes;
+					optional float optional_float32;
+					optional double optional_float64;
+					optional int32 optional_int32 (INT (32, true));
+					optional int64 optional_int64 (INT (64, true));
+					optional boolean optional_bool;
+					required boolean required_bool;
+					optional binary optional_string (STRING);
+					required fixed_len_byte_array (24) required_fixed_binary;
+				}
+			`,
+		},
+		{
+			name: "lists",
+			schema: arrow.NewSchema([]arrow.Field{
+				{Name: "optional_bools", Type: arrow.ListOf(arrow.FixedWidthTypes.Boolean), Nullable: true},
+				{Name: "required_nullable_strings", Type: arrow.ListOf(arrow.BinaryTypes.String), Nullable: false},
+			}, nil),
+			expected: `
+				message {
+					optional group optional_bools (LIST) {
+						repeated group list {
+							optional boolean element;
+						}
+					}
+					required group required_nullable_strings (LIST) {
+						repeated group list {
+							optional binary element (STRING);
+						}
+					}
+				}
+			`,
+		},
+		{
+			name: "TODO: ticket this issue with non-nullable list items",
+			schema: arrow.NewSchema([]arrow.Field{
+				{Name: "optional_nonnullable_bools", Type: arrow.ListOfNonNullable(arrow.FixedWidthTypes.Boolean), Nullable: false},
+			}, nil),
+			expected: `
+				message {
+					required group optional_nonnullable_bools (LIST) {
+						repeated group list {
+							optional boolean element;
+						}
+					}
+				}
+			`,
+		},
+		{
+			name: "structs",
+			schema: arrow.NewSchema([]arrow.Field{
+				{Name: "soup", Type: arrow.StructOf(
+					arrow.Field{Name: "good", Type: arrow.FixedWidthTypes.Boolean, Nullable: false},
+					arrow.Field{Name: "helpings", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
+				), Nullable: false},
+			}, nil),
+			expected: `
+				message {
+					required group soup {
+						required boolean good;
+						optional double helpings;
+					}
+				}
+			`,
+		},
+		{
+			name: "lists of structs",
+			schema: arrow.NewSchema([]arrow.Field{
+				{Name: "things", Type: arrow.ListOf(arrow.StructOf(
+					arrow.Field{Name: "name", Type: arrow.BinaryTypes.String, Nullable: false},
+					arrow.Field{Name: "cost", Type: arrow.PrimitiveTypes.Float64, Nullable: true},
+				)), Nullable: true},
+			}, nil),
+			expected: `
+				message {
+					optional group things (LIST) {
+						repeated group list {
+							optional group element {
+								required binary name (STRING);
+								optional double cost;
+							}
+						}
+					}
+				}
+			`,
+		},
+	}
+
+	for i, c := range cases {
+		t.Run(fmt.Sprintf("%s (case %d)", c.name, i), func(t *testing.T) {
+			parquetSchema, err := pqarrow.ToParquet(c.schema, nil, pqarrow.DefaultWriterProps())
+			require.NoError(t, err)
+
+			assert.Equal(t, test.Tab2Space(test.Dedent(c.expected)), pqutil.ParquetSchemaString(parquetSchema))
+		})
+	}
+}
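The third case above deliberately pins a surprising behavior: pqarrow.ToParquet writes list elements as optional even when the Arrow element type is declared non-nullable (arrow.ListOfNonNullable), which is why the expected schema shows "optional boolean element" and the case name carries a TODO to ticket the issue upstream.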
diff --git a/internal/pqutil/transform.go b/internal/pqutil/transform.go
new file mode 100644
index 0000000..1a4c059
--- /dev/null
+++ b/internal/pqutil/transform.go
@@ -0,0 +1,142 @@
+package pqutil
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/apache/arrow/go/v14/arrow"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+	"github.com/apache/arrow/go/v14/parquet"
+	"github.com/apache/arrow/go/v14/parquet/compress"
+	"github.com/apache/arrow/go/v14/parquet/file"
+	"github.com/apache/arrow/go/v14/parquet/pqarrow"
+	"github.com/apache/arrow/go/v14/parquet/schema"
+)
+
+type ColumnTransformer func(*arrow.Field, *arrow.Field, *arrow.Chunked) (*arrow.Chunked, error)
+
+type SchemaTransformer func(*schema.Schema) (*schema.Schema, error)
+
+type TransformConfig struct {
+	Reader          parquet.ReaderAtSeeker
+	Writer          io.Writer
+	Compression     *compress.Compression
+	TransformSchema SchemaTransformer
+	TransformColumn ColumnTransformer
+	BeforeClose     func(*file.Writer) error
+}
+
+func getWriterProperties(config *TransformConfig, fileReader *file.Reader) (*parquet.WriterProperties, error) {
+	var writerProperties []parquet.WriterProperty
+	if config.Compression != nil {
+		writerProperties = append(writerProperties, parquet.WithCompression(*config.Compression))
+	}
+	// retain existing column compression (from the first row group)
+	if fileReader.NumRowGroups() > 0 {
+		rowGroupMetadata := fileReader.RowGroup(0).MetaData()
+		for colNum := 0; colNum < rowGroupMetadata.NumColumns(); colNum += 1 {
+			colChunkMetadata, err := rowGroupMetadata.ColumnChunk(colNum)
+			if err != nil {
+				return nil, fmt.Errorf("failed to get column chunk metadata for column %d", colNum)
+			}
+			compression := colChunkMetadata.Compression()
+			if compression != compress.Codecs.Uncompressed {
+				colPath := colChunkMetadata.PathInSchema()
+				writerProperties = append(writerProperties, parquet.WithCompressionPath(colPath, compression))
+			}
+		}
+	}
+
+	return parquet.NewWriterProperties(writerProperties...), nil
+}
+
+func TransformByColumn(config *TransformConfig) error {
+	if config.Reader == nil {
+		return errors.New("reader is required")
+	}
+	if config.Writer == nil {
+		return errors.New("writer is required")
+	}
+
+	fileReader, fileReaderErr := file.NewParquetReader(config.Reader)
+	if fileReaderErr != nil {
+		return fileReaderErr
+	}
+	defer fileReader.Close()
+	inputSchema := fileReader.MetaData().Schema
+
+	outputSchema := inputSchema
+	if config.TransformSchema != nil {
+		schema, err := config.TransformSchema(inputSchema)
+		if err != nil {
+			return err
+		}
+		outputSchema = schema
+	}
+
+	arrowReadProperties := pqarrow.ArrowReadProperties{}
+
+	arrowReader, arrowError := pqarrow.NewFileReader(fileReader, arrowReadProperties, memory.DefaultAllocator)
+	if arrowError != nil {
+		return arrowError
+	}
+	inputManifest := arrowReader.Manifest
+
+	outputManifest, manifestErr := pqarrow.NewSchemaManifest(outputSchema, fileReader.MetaData().KeyValueMetadata(), &arrowReadProperties)
+	if manifestErr != nil {
+		return manifestErr
+	}
+
+	numFields := len(outputManifest.Fields)
+	if numFields != len(inputManifest.Fields) {
+		return fmt.Errorf("unexpected number of fields in the output schema, got %d, expected %d", numFields, len(inputManifest.Fields))
+	}
+
+	writerProperties, propErr := getWriterProperties(config, fileReader)
+	if propErr != nil {
+		return propErr
+	}
+
+	fileWriter := file.NewParquetWriter(config.Writer, outputSchema.Root(), file.WithWriterProps(writerProperties))
+	defer fileWriter.Close()
+
+	ctx := pqarrow.NewArrowWriteContext(context.Background(), nil)
+
+	numRowGroups := fileReader.NumRowGroups()
+	for rowGroupIndex := 0; rowGroupIndex < numRowGroups; rowGroupIndex += 1 {
+		rowGroupReader := arrowReader.RowGroup(rowGroupIndex)
+		rowGroupWriter := fileWriter.AppendRowGroup()
+		for fieldNum := 0; fieldNum < numFields; fieldNum += 1 {
+			arr, readErr := rowGroupReader.Column(fieldNum).Read(ctx)
+			if readErr != nil {
+				return readErr
+			}
+			if config.TransformColumn != nil {
+				inputField := inputManifest.Fields[fieldNum].Field
+				outputField := outputManifest.Fields[fieldNum].Field
+				transformed, err := config.TransformColumn(inputField, outputField, arr)
+				if err != nil {
+					return err
+				}
+				if transformed.DataType() != outputField.Type {
+					return fmt.Errorf("transform generated an unexpected type, got %s, expected %s", transformed.DataType().Name(), outputField.Type.Name())
+				}
+				arr = transformed
+			}
+			colWriter, colWriterErr := pqarrow.NewArrowColumnWriter(arr, 0, int64(arr.Len()), outputManifest, rowGroupWriter, fieldNum)
+			if colWriterErr != nil {
+				return colWriterErr
+			}
+			if err := colWriter.Write(ctx); err != nil {
+				return err
+			}
+		}
+	}
+
+	if config.BeforeClose != nil {
+		return config.BeforeClose(fileWriter)
+	}
+	return nil
+}
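TransformByColumn is the generic copy loop the rest of this change builds on: everything in TransformConfig is optional except Reader and Writer, so the simplest useful call just recompresses a file. A minimal sketch (not part of the diff), assuming hypothetical input.parquet and output.parquet paths:

    package main

    import (
        "log"
        "os"

        "github.com/apache/arrow/go/v14/parquet/compress"
        "github.com/planetlabs/gpq/internal/pqutil"
    )

    func main() {
        input, err := os.Open("input.parquet") // hypothetical path
        if err != nil {
            log.Fatal(err)
        }
        defer input.Close()

        output, err := os.Create("output.parquet") // hypothetical path
        if err != nil {
            log.Fatal(err)
        }
        defer output.Close()

        // With no schema or column transform configured, the file is copied
        // row group by row group; the codec applies as the output default.
        codec := compress.Codecs.Zstd
        config := &pqutil.TransformConfig{
            Reader:      input,
            Writer:      output,
            Compression: &codec,
        }
        if err := pqutil.TransformByColumn(config); err != nil {
            log.Fatal(err)
        }
    }

Note the interplay with getWriterProperties above: per-column codecs found in the first input row group are added as column-specific properties, so columns that were already compressed keep their original codec rather than the default.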
diff --git a/internal/pqutil/transform_test.go b/internal/pqutil/transform_test.go
new file mode 100644
index 0000000..f302894
--- /dev/null
+++ b/internal/pqutil/transform_test.go
@@ -0,0 +1,214 @@
+package pqutil_test
+
+import (
+	"bytes"
+	"fmt"
+	"strconv"
+	"testing"
+
+	"github.com/apache/arrow/go/v14/arrow"
+	"github.com/apache/arrow/go/v14/arrow/array"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+	"github.com/apache/arrow/go/v14/parquet"
+	"github.com/apache/arrow/go/v14/parquet/compress"
+	"github.com/apache/arrow/go/v14/parquet/file"
+	"github.com/apache/arrow/go/v14/parquet/schema"
+	"github.com/planetlabs/gpq/internal/pqutil"
+	"github.com/planetlabs/gpq/internal/test"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestTransformByColumn(t *testing.T) {
+	cases := []struct {
+		name   string
+		data   string
+		config *pqutil.TransformConfig
+	}{
+		{
+			name: "basics",
+			data: `[
+				{
+					"product": "soup",
+					"cost": 1.29
+				},
+				{
+					"product": "747",
+					"cost": 100000000
+				}
+			]`,
+		},
+		{
+			name: "repeated values",
+			data: `[
+				{
+					"name": "Taylor",
+					"grades": ["A", "B", "C"]
+				},
+				{
+					"name": "Kai",
+					"grades": ["C", "B", "A"]
+				}
+			]`,
+		},
+		{
+			name: "with snappy compression",
+			data: `[
+				{
+					"number": 42
+				},
+				{
+					"number": 3.14
+				}
+			]`,
+			config: &pqutil.TransformConfig{
+				Compression: &compress.Codecs.Snappy,
+			},
+		},
+		{
+			name: "with gzip compression",
+			data: `[
+				{
+					"number": 42
+				},
+				{
+					"number": 3.14
+				}
+			]`,
+			config: &pqutil.TransformConfig{
+				Compression: &compress.Codecs.Gzip,
+			},
+		},
+	}
+
+	for i, c := range cases {
+		t.Run(fmt.Sprintf("%s (case %d)", c.name, i), func(t *testing.T) {
+			input := test.ParquetFromJSON(t, c.data)
+			output := &bytes.Buffer{}
+			config := c.config
+			if config == nil {
+				config = &pqutil.TransformConfig{}
+			}
+			config.Reader = input
+			config.Writer = output
+
+			require.NoError(t, pqutil.TransformByColumn(config))
+
+			outputAsJSON := test.ParquetToJSON(t, bytes.NewReader(output.Bytes()))
+			assert.JSONEq(t, c.data, outputAsJSON)
+
+			if c.config == nil {
+				return
+			}
+
+			fileReader, err := file.NewParquetReader(bytes.NewReader(output.Bytes()))
+			require.NoError(t, err)
+			defer fileReader.Close()
+
+			if c.config.Compression != nil {
+				expected := *c.config.Compression
+				require.Greater(t, fileReader.NumRowGroups(), 0)
+				rowGroupMetadata := fileReader.RowGroup(0).MetaData()
+				numColumns := rowGroupMetadata.NumColumns()
+				assert.Greater(t, numColumns, 0)
+				for colNum := 0; colNum < numColumns; colNum += 1 {
+					columnChunk, err := rowGroupMetadata.ColumnChunk(colNum)
+					require.NoError(t, err)
+					assert.Equal(t, expected, columnChunk.Compression())
+				}
+			}
+		})
+	}
+}
+
+func TestTransformColumn(t *testing.T) {
+	data := `[
+		{
+			"product": "soup",
+			"cost": "1.29"
+		},
+		{
+			"product": "747",
+			"cost": "100000000"
+		}
+	]`
+
+	expected := `[
+		{
+			"product": "soup",
+			"cost": 1.29
+		},
+		{
+			"product": "747",
+			"cost": 100000000
+		}
+	]`
+
+	transformSchema := func(inputSchema *schema.Schema) (*schema.Schema, error) {
+		inputRoot := inputSchema.Root()
+		numFields := inputRoot.NumFields()
+
+		fields := make([]schema.Node, numFields)
+		for fieldNum := 0; fieldNum < numFields; fieldNum += 1 {
+			inputField := inputRoot.Field(fieldNum)
+			if inputField.Name() != "cost" {
+				fields[fieldNum] = inputField
+				continue
+			}
+			outputField, err := schema.NewPrimitiveNode(inputField.Name(), inputField.RepetitionType(), parquet.Types.Double, -1, -1)
+			if err != nil {
+				return nil, err
+			}
+			fields[fieldNum] = outputField
+		}
+
+		outputRoot, err := schema.NewGroupNode(inputRoot.Name(), inputRoot.RepetitionType(), fields, -1)
+		if err != nil {
+			return nil, err
+		}
+		return schema.NewSchema(outputRoot), nil
+	}
+
+	transformColumn := func(inputField *arrow.Field, outputField *arrow.Field, chunked *arrow.Chunked) (*arrow.Chunked, error) {
+		if inputField.Name != "cost" {
+			return chunked, nil
+		}
+		chunks := chunked.Chunks()
+		transformed := make([]arrow.Array, len(chunks))
+		builder := array.NewFloat64Builder(memory.DefaultAllocator)
+		defer builder.Release()
+		for i, arr := range chunks {
+			stringArray, ok := arr.(*array.String)
+			if !ok {
+				return nil, fmt.Errorf("expected a string array, got %v", arr)
+			}
+			for rowNum := 0; rowNum < stringArray.Len(); rowNum += 1 {
+				if outputField.Nullable && stringArray.IsNull(rowNum) {
+					builder.AppendNull()
+					continue
+				}
+				str := stringArray.Value(rowNum)
+				value, err := strconv.ParseFloat(str, 64)
+				if err != nil {
+					return nil, fmt.Errorf("trouble parsing %q as float: %w", str, err)
+				}
+				builder.Append(value)
+			}
+			transformed[i] = builder.NewArray()
+		}
+		return arrow.NewChunked(builder.Type(), transformed), nil
+	}
+
+	input := test.ParquetFromJSON(t, data)
+	output := &bytes.Buffer{}
+	config := &pqutil.TransformConfig{
+		Reader:          input,
+		TransformSchema: transformSchema,
+		TransformColumn: transformColumn,
+		Writer:          output,
+	}
+	require.NoError(t, pqutil.TransformByColumn(config))
+
+	outputAsJSON := test.ParquetToJSON(t, bytes.NewReader(output.Bytes()))
+	assert.JSONEq(t, expected, outputAsJSON)
+}
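Two details of TestTransformColumn above are easy to miss: the schema transform and the column transform must agree, since TransformByColumn checks each transformed chunk against the output manifest and fails with the "transform generated an unexpected type" error on a mismatch; and because the copy proceeds one column of one row group at a time, the string-to-double conversion never needs the whole file in memory.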
diff --git a/internal/test/test.go b/internal/test/test.go
new file mode 100644
index 0000000..0921162
--- /dev/null
+++ b/internal/test/test.go
@@ -0,0 +1,187 @@
+package test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/apache/arrow/go/v14/arrow"
+	"github.com/apache/arrow/go/v14/arrow/array"
+	"github.com/apache/arrow/go/v14/arrow/memory"
+	"github.com/apache/arrow/go/v14/parquet"
+	"github.com/apache/arrow/go/v14/parquet/file"
+	"github.com/apache/arrow/go/v14/parquet/pqarrow"
+	"github.com/apache/arrow/go/v14/parquet/schema"
+	"github.com/planetlabs/gpq/internal/geoparquet"
+	"github.com/planetlabs/gpq/internal/pqutil"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func ParquetFromJSON(t *testing.T, data string) parquet.ReaderAtSeeker {
+	var rows []map[string]any
+	require.NoError(t, json.Unmarshal([]byte(data), &rows))
+
+	builder := pqutil.NewArrowSchemaBuilder()
+	for _, row := range rows {
+		require.NoError(t, builder.Add(row))
+	}
+	if !builder.Ready() {
+		assert.Fail(t, "could not derive schema from rows")
+	}
+
+	schema, err := builder.Schema()
+	require.NoError(t, err)
+
+	rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(string(data)))
+	require.NoError(t, err)
+
+	output := &bytes.Buffer{}
+
+	writer, err := pqarrow.NewFileWriter(schema, output, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
+	require.NoError(t, err)
+
+	require.NoError(t, writer.WriteBuffered(rec))
+	require.NoError(t, writer.Close())
+
+	return bytes.NewReader(output.Bytes())
+}
+
+func ParquetToJSON(t *testing.T, input parquet.ReaderAtSeeker) string {
+	fileReader, err := file.NewParquetReader(input)
+	require.NoError(t, err)
+
+	arrowReader, err := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator)
+	require.NoError(t, err)
+
+	recordReader, err := arrowReader.GetRecordReader(context.Background(), nil, nil)
+	require.NoError(t, err)
+
+	rows := []map[string]any{}
+
+	for {
+		record, err := recordReader.Read()
+		if err == io.EOF {
+			break
+		}
+		require.NoError(t, err)
+
+		schema := record.Schema()
+		arr := array.RecordToStructArray(record)
+		defer arr.Release()
+
+		for rowNum := 0; rowNum < arr.Len(); rowNum += 1 {
+			row := map[string]any{}
+			for fieldNum := 0; fieldNum < arr.NumField(); fieldNum += 1 {
+				name := schema.Field(fieldNum).Name
+				value := arr.Field(fieldNum).GetOneForMarshal(rowNum)
+				row[name] = value
+			}
+			rows = append(rows, row)
+		}
+	}
+
+	data, err := json.Marshal(rows)
+	require.NoError(t, err)
+	return string(data)
+}
+
+func GeoParquetFromStructs[T any](t *testing.T, rows []T, metadata *geoparquet.Metadata) parquet.ReaderAtSeeker {
+	parquetSchema, err := schema.NewSchemaFromStruct(rows[0])
+	require.NoError(t, err)
+
+	arrowSchema, err := pqarrow.FromParquet(parquetSchema, nil, nil)
+	require.NoError(t, err)
+
+	output := &bytes.Buffer{}
+	recordWriter, err := geoparquet.NewRecordWriter(&geoparquet.WriterConfig{
+		Writer:      output,
+		Metadata:    metadata,
+		ArrowSchema: arrowSchema,
+	})
+	require.NoError(t, err)
+
+	data, err := json.Marshal(rows)
+	require.NoError(t, err)
+
+	rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, arrowSchema, strings.NewReader(string(data)))
+	require.NoError(t, err)
+
+	require.NoError(t, recordWriter.Write(rec))
+	require.NoError(t, recordWriter.Close())
+
+	return bytes.NewReader(output.Bytes())
+}
+
+func ParquetFromStructs[T any](t *testing.T, rows []T) parquet.ReaderAtSeeker {
+	parquetSchema, err := schema.NewSchemaFromStruct(rows[0])
+	require.NoError(t, err)
+
+	arrowSchema, err := pqarrow.FromParquet(parquetSchema, nil, nil)
+	require.NoError(t, err)
+
+	data, err := json.Marshal(rows)
+	require.NoError(t, err)
+
+	rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, arrowSchema, strings.NewReader(string(data)))
+	require.NoError(t, err)
+
+	output := &bytes.Buffer{}
+
+	writer, err := pqarrow.NewFileWriter(arrowSchema, output, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
+	require.NoError(t, err)
+
+	require.NoError(t, writer.WriteBuffered(rec))
+	require.NoError(t, writer.Close())
+
+	return bytes.NewReader(output.Bytes())
+}
+
+func AssertArrowSchemaMatches(t *testing.T, expected string, schema *arrow.Schema) {
+	parquetSchema, err := pqarrow.ToParquet(schema, nil, pqarrow.DefaultWriterProps())
+	require.NoError(t, err)
+
+	assert.Equal(t, Tab2Space(Dedent(expected)), pqutil.ParquetSchemaString(parquetSchema))
+}
+
+func Dedent(block string) string {
+	newline := "\n"
+	whitespace := " \t"
+
+	lines := strings.Split(block, newline)
+	prefixLen := -1
+
+	if len(lines) == 0 {
+		return block
+	}
+
+	if len(strings.TrimLeft(lines[0], whitespace)) == 0 {
+		lines = lines[1:]
+	}
+	if len(strings.TrimLeft(lines[len(lines)-1], whitespace)) == 0 {
+		lines = lines[:len(lines)-1]
+	}
+
+	dedentedLines := []string{}
+	for _, line := range lines {
+		if prefixLen < 0 {
+			trimmedLine := strings.TrimLeft(line, whitespace)
+			prefixLen = len(line) - len(trimmedLine)
+			dedentedLines = append(dedentedLines, trimmedLine)
+			continue
+		}
+		if prefixLen > len(line)-1 {
+			dedentedLines = append(dedentedLines, strings.TrimLeft(line, whitespace))
+			continue
+		}
+		dedentedLines = append(dedentedLines, line[prefixLen:])
+	}
+	return strings.Join(dedentedLines, newline) + newline
+}
+
+func Tab2Space(str string) string {
+	return strings.ReplaceAll(str, "\t", " ")
+}
diff --git a/internal/validator/rules.go
b/internal/validator/rules.go index 63567f7..779f6eb 100644 --- a/internal/validator/rules.go +++ b/internal/validator/rules.go @@ -19,10 +19,13 @@ import ( "errors" "fmt" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/file" + "github.com/apache/arrow/go/v14/parquet/schema" "github.com/paulmach/orb" + "github.com/planetlabs/gpq/internal/geo" "github.com/planetlabs/gpq/internal/geoparquet" "github.com/santhosh-tekuri/jsonschema/v5" - "github.com/segmentio/parquet-go" ) type MetadataMap map[string]any @@ -30,19 +33,12 @@ type MetadataMap map[string]any type ColumnMetdataMap map[string]map[string]any type FileInfo struct { - File *parquet.File + File *file.Reader Metadata *geoparquet.Metadata } type RuleData interface { - *parquet.File | MetadataMap | ColumnMetdataMap | *FileInfo -} - -type EncodedGeometryMap map[string]any -type DecodedGeometryMap map[string]orb.Geometry - -type RowData interface { - EncodedGeometryMap | DecodedGeometryMap + *file.Reader | MetadataMap | ColumnMetdataMap | *FileInfo } type Rule interface { @@ -73,7 +69,7 @@ type GenericRule[T RuleData] struct { validate func(T) error } -var _ Rule = (*GenericRule[*parquet.File])(nil) +var _ Rule = (*GenericRule[*file.Reader])(nil) func (r *GenericRule[T]) Title() string { return r.title @@ -87,31 +83,31 @@ func (r *GenericRule[T]) Validate() error { return r.validate(r.value) } -type RowRule[T RowData] struct { +type ColumnValueRule[T any] struct { title string - row func(*FileInfo, T) error + value func(*FileInfo, string, T) error info *FileInfo err error } -var _ Rule = (*RowRule[EncodedGeometryMap])(nil) +var _ Rule = (*ColumnValueRule[*string])(nil) -func (r *RowRule[T]) Title() string { +func (r *ColumnValueRule[T]) Title() string { return r.title } -func (r *RowRule[T]) Init(info *FileInfo) { +func (r *ColumnValueRule[T]) Init(info *FileInfo) { r.info = info } -func (r *RowRule[T]) Row(data T) error { +func (r *ColumnValueRule[T]) Value(name string, data T) error { if r.err == nil { - r.err = r.row(r.info, data) + r.err = r.value(r.info, name, data) } return r.err } -func (r *RowRule[T]) Validate() error { +func (r *ColumnValueRule[T]) Validate() error { return r.err } @@ -124,11 +120,11 @@ func asJSON(value any) string { } func RequiredGeoKey() Rule { - return &GenericRule[*parquet.File]{ + return &GenericRule[*file.Reader]{ title: fmt.Sprintf("file must include a %q metadata key", geoparquet.MetadataKey), - validate: func(file *parquet.File) error { - _, ok := file.Lookup(geoparquet.MetadataKey) - if !ok { + validate: func(file *file.Reader) error { + kv := file.MetaData().KeyValueMetadata() + if kv.FindValue(geoparquet.MetadataKey) == nil { return fatal("missing %q metadata key", geoparquet.MetadataKey) } return nil @@ -137,10 +133,10 @@ func RequiredGeoKey() Rule { } func RequiredMetadataType() Rule { - return &GenericRule[*parquet.File]{ + return &GenericRule[*file.Reader]{ title: "metadata must be a JSON object", - validate: func(file *parquet.File) error { - value, geoErr := geoparquet.GetMetadataValue(file) + validate: func(file *file.Reader) error { + value, geoErr := geoparquet.GetMetadataValue(file.MetaData().KeyValueMetadata()) if geoErr != nil { return fatal(geoErr.Error()) } @@ -228,7 +224,7 @@ func RequiredColumnEncoding() Rule { if !ok { return fmt.Errorf(`expected "encoding" for column %q to be a string, got %s`, name, asJSON(meta["encoding"])) } - if encoding != geoparquet.EncodingWKB { + if encoding != geoparquet.DefaultGeometryEncoding { return 
fmt.Errorf(`unsupported encoding %q for column %q`, encoding, name) } } @@ -431,19 +427,20 @@ func PrimaryColumnInLookup() Rule { } } -func GeometryDataType() Rule { +func GeometryUngrouped() Rule { return &GenericRule[*FileInfo]{ - title: "geometry columns must be stored using the BYTE_ARRAY parquet type", + title: "geometry columns must not be grouped", validate: func(info *FileInfo) error { metadata := info.Metadata - schema := info.File.Schema() + sc := info.File.MetaData().Schema for name := range metadata.Columns { - column, ok := schema.Lookup(name) - if !ok { + index := sc.ColumnIndexByName(name) + if index < 0 { return fatal("missing geometry column %q", name) } - if column.Node.Type() != parquet.ByteArrayType { - return fatal("unexpected type for column %q, got %s", name, column.Node.Type()) + _, ok := sc.Root().Field(index).(*schema.PrimitiveNode) + if !ok { + return fmt.Errorf("column %q must not be a group", name) } } @@ -452,19 +449,24 @@ func GeometryDataType() Rule { } } -func GeometryUngrouped() Rule { +func GeometryDataType() Rule { return &GenericRule[*FileInfo]{ - title: "geometry columns must not be grouped", + title: "geometry columns must be stored using the BYTE_ARRAY parquet type", validate: func(info *FileInfo) error { metadata := info.Metadata - schema := info.File.Schema() + sc := info.File.MetaData().Schema for name := range metadata.Columns { - column, ok := schema.Lookup(name) - if !ok { + index := sc.ColumnIndexByName(name) + if index < 0 { return fatal("missing geometry column %q", name) } - if !column.Node.Leaf() { - return fmt.Errorf("column %q must not be a group", name) + + field, ok := sc.Root().Field(index).(*schema.PrimitiveNode) + if !ok { + return fatal("expected primitive column for %q", name) + } + if field.PhysicalType() != parquet.Types.ByteArray { + return fatal("unexpected type for column %q, got %s", name, field.PhysicalType()) } } @@ -478,16 +480,18 @@ func GeometryRepetition() Rule { title: "geometry columns must be required or optional, not repeated", validate: func(info *FileInfo) error { metadata := info.Metadata - schema := info.File.Schema() + sc := info.File.MetaData().Schema for name := range metadata.Columns { - column, ok := schema.Lookup(name) - if !ok { + index := sc.ColumnIndexByName(name) + if index < 0 { return fatal("missing geometry column %q", name) } - if column.Node.Repeated() { + + repetitionType := sc.Root().Field(index).RepetitionType() + if repetitionType == parquet.Repetitions.Repeated { return fmt.Errorf("column %q must not be repeated", name) } - if !column.Node.Required() && !column.Node.Optional() { + if repetitionType != parquet.Repetitions.Required && repetitionType != parquet.Repetitions.Optional { return fmt.Errorf("column %q must be required or optional", name) } } @@ -498,17 +502,16 @@ func GeometryRepetition() Rule { } func GeometryEncoding() Rule { - return &RowRule[EncodedGeometryMap]{ + return &ColumnValueRule[any]{ title: `all geometry values match the "encoding" metadata`, - row: func(info *FileInfo, geometries EncodedGeometryMap) error { - schema := info.File.Schema() - metadata := info.Metadata - - for name, encoded := range geometries { - _, _, err := geoparquet.Geometry(encoded, name, metadata, schema) - if err != nil { - return fatal("invalid geometry in column %q: %s", name, err) - } + value: func(info *FileInfo, name string, data any) error { + geomColumn := info.Metadata.Columns[name] + if geomColumn == nil { + return fatal("missing geometry column %q", name) + } + _, err := 
geo.DecodeGeometry(data, geomColumn.Encoding) + if err != nil { + return fatal("invalid geometry in column %q: %s", name, err) } return nil @@ -517,32 +520,29 @@ func GeometryEncoding() Rule { } func GeometryTypes() Rule { - return &RowRule[DecodedGeometryMap]{ + return &ColumnValueRule[orb.Geometry]{ title: `all geometry types must be included in the "geometry_types" metadata (if not empty)`, - row: func(info *FileInfo, geometries DecodedGeometryMap) error { - metadata := info.Metadata + value: func(info *FileInfo, name string, geometry orb.Geometry) error { + geomColumn := info.Metadata.Columns[name] + if geomColumn == nil { + return fatal("missing geometry column %q", name) + } - for name, geometry := range geometries { - meta, ok := metadata.Columns[name] - if !ok { - return fatal("missing metadata for column %q", name) - } - geometryTypes := meta.GetGeometryTypes() - if len(geometryTypes) == 0 { - continue - } - actualType := geometry.GeoJSONType() - included := false - for _, expectedType := range geometryTypes { - if actualType == expectedType || actualType+" Z" == expectedType { - included = true - break - } - } - if !included { - return fmt.Errorf("unexpected geometry type %q for column %q", actualType, name) + geometryTypes := geomColumn.GetGeometryTypes() + if len(geometryTypes) == 0 { + return nil + } + actualType := geometry.GeoJSONType() + included := false + for _, expectedType := range geometryTypes { + if actualType == expectedType || actualType+" Z" == expectedType { + included = true + break } } + if !included { + return fmt.Errorf("unexpected geometry type %q for column %q", actualType, name) + } return nil }, @@ -550,41 +550,38 @@ func GeometryTypes() Rule { } func GeometryOrientation() Rule { - return &RowRule[DecodedGeometryMap]{ + return &ColumnValueRule[orb.Geometry]{ title: `all polygon geometries must follow the "orientation" metadata (if present)`, - row: func(info *FileInfo, geometries DecodedGeometryMap) error { - metadata := info.Metadata + value: func(info *FileInfo, name string, geometry orb.Geometry) error { + geomColumn := info.Metadata.Columns[name] + if geomColumn == nil { + return fatal("missing geometry column %q", name) + } - for name, geometry := range geometries { - meta, ok := metadata.Columns[name] - if !ok { - return fatal("missing metadata for column %q", name) - } - if meta.Orientation == "" { - continue - } - if meta.Orientation != geoparquet.OrientationCounterClockwise { - return fmt.Errorf("unsupported orientation %q for column %q", meta.Orientation, name) - } - polygon, ok := geometry.(orb.Polygon) - if !ok { - continue - } + if geomColumn.Orientation == "" { + return nil + } + if geomColumn.Orientation != geoparquet.OrientationCounterClockwise { + return fmt.Errorf("unsupported orientation %q for column %q", geomColumn.Orientation, name) + } + polygon, ok := geometry.(orb.Polygon) + if !ok { + return nil + } - expectedExterior := orb.CCW - expectedInterior := orb.CW + expectedExterior := orb.CCW + expectedInterior := orb.CW - for i, ring := range polygon { - orientation := ring.Orientation() - if i == 0 { - if orientation != expectedExterior { - return fmt.Errorf("invalid orientation for exterior ring in column %q", name) - } - continue - } - if orientation != expectedInterior { - return fmt.Errorf("invalid orientation for interior ring in column %q", name) + for i, ring := range polygon { + orientation := ring.Orientation() + if i == 0 { + if orientation != expectedExterior { + return fmt.Errorf("invalid orientation for exterior ring in 
column %q", name) } + continue + } + if orientation != expectedInterior { + return fmt.Errorf("invalid orientation for interior ring in column %q", name) } } @@ -594,64 +591,61 @@ func GeometryOrientation() Rule { } func GeometryBounds() Rule { - return &RowRule[DecodedGeometryMap]{ + return &ColumnValueRule[orb.Geometry]{ title: `all geometries must fall within the "bbox" metadata (if present)`, - row: func(info *FileInfo, geometries DecodedGeometryMap) error { - metadata := info.Metadata + value: func(info *FileInfo, name string, geometry orb.Geometry) error { + geomColumn := info.Metadata.Columns[name] + if geomColumn == nil { + return fatal("missing geometry column %q", name) + } - for name, geometry := range geometries { - meta, ok := metadata.Columns[name] - if !ok { - return fatal("missing metadata for column %q", name) - } - bbox := meta.Bounds - length := len(bbox) - if length == 0 { - continue + bbox := geomColumn.Bounds + length := len(bbox) + if length == 0 { + return nil + } + var x0 float64 + var x1 float64 + var y0 float64 + var y1 float64 + if length == 4 { + x0 = bbox[0] + y0 = bbox[1] + x1 = bbox[2] + y1 = bbox[3] + } else if length == 6 { + x0 = bbox[0] + y0 = bbox[1] + x1 = bbox[3] + y1 = bbox[4] + } else { + return fmt.Errorf("invalid bbox length for column %q", name) + } + + bound := geometry.Bound() + if x0 <= x1 { + // bbox does not cross the antimeridian + if bound.Min.X() < x0 { + return fmt.Errorf("geometry in column %q extends to %f, west of the bbox", name, bound.Min.X()) } - var x0 float64 - var x1 float64 - var y0 float64 - var y1 float64 - if length == 4 { - x0 = bbox[0] - y0 = bbox[1] - x1 = bbox[2] - y1 = bbox[3] - } else if length == 6 { - x0 = bbox[0] - y0 = bbox[1] - x1 = bbox[3] - y1 = bbox[4] - } else { - return fmt.Errorf("invalid bbox length for column %q", name) - } - - bound := geometry.Bound() - if x0 <= x1 { - // bbox does not cross the antimeridian - if bound.Min.X() < x0 { - return fmt.Errorf("geometry in column %q extends to %f, west of the bbox", name, bound.Min.X()) - } - if bound.Max.X() > x1 { - return fmt.Errorf("geometry in column %q extends to %f, east of the bbox", name, bound.Max.X()) - } - } else { - // bbox crosses the antimeridian - if bound.Max.X() > x1 && bound.Max.X() < x0 { - return fmt.Errorf("geometry in column %q extends to %f, outside of the bbox", name, bound.Max.X()) - } - if bound.Min.X() < x0 && bound.Min.X() > x1 { - return fmt.Errorf("geometry in column %q extends to %f, outside of the bbox", name, bound.Min.X()) - } + if bound.Max.X() > x1 { + return fmt.Errorf("geometry in column %q extends to %f, east of the bbox", name, bound.Max.X()) } - if bound.Min.Y() < y0 { - return fmt.Errorf("geometry in column %q extends to %f, south of the bbox", name, bound.Min.Y()) + } else { + // bbox crosses the antimeridian + if bound.Max.X() > x1 && bound.Max.X() < x0 { + return fmt.Errorf("geometry in column %q extends to %f, outside of the bbox", name, bound.Max.X()) } - if bound.Max.Y() > y1 { - return fmt.Errorf("geometry in column %q extends to %f, north of the bbox", name, bound.Max.Y()) + if bound.Min.X() < x0 && bound.Min.X() > x1 { + return fmt.Errorf("geometry in column %q extends to %f, outside of the bbox", name, bound.Min.X()) } } + if bound.Min.Y() < y0 { + return fmt.Errorf("geometry in column %q extends to %f, south of the bbox", name, bound.Min.Y()) + } + if bound.Max.Y() > y1 { + return fmt.Errorf("geometry in column %q extends to %f, north of the bbox", name, bound.Max.Y()) + } return nil }, diff --git 
a/internal/validator/testdata/all-pass-meta/expected.json b/internal/validator/testdata/all-pass-meta/expected.json index 69b2217..5160470 100644 --- a/internal/validator/testdata/all-pass-meta/expected.json +++ b/internal/validator/testdata/all-pass-meta/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/all-pass-minimal/expected.json b/internal/validator/testdata/all-pass-minimal/expected.json index 2b2da11..313c688 100644 --- a/internal/validator/testdata/all-pass-minimal/expected.json +++ b/internal/validator/testdata/all-pass-minimal/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/all-pass/expected.json b/internal/validator/testdata/all-pass/expected.json index 2b2da11..313c688 100644 --- a/internal/validator/testdata/all-pass/expected.json +++ b/internal/validator/testdata/all-pass/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-bbox-length/expected.json b/internal/validator/testdata/bad-bbox-length/expected.json index a3febca..bfb63b3 100644 --- a/internal/validator/testdata/bad-bbox-length/expected.json +++ b/internal/validator/testdata/bad-bbox-length/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-bbox-type/expected.json b/internal/validator/testdata/bad-bbox-type/expected.json index 7e628bf..6de45a3 100644 --- a/internal/validator/testdata/bad-bbox-type/expected.json +++ b/internal/validator/testdata/bad-bbox-type/expected.json @@ -67,12 +67,12 @@ "passed": false }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": false, "passed": false }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": false, "passed": false }, diff --git a/internal/validator/testdata/bad-crs/expected.json b/internal/validator/testdata/bad-crs/expected.json index 5d78150..45c5c99 100644 --- a/internal/validator/testdata/bad-crs/expected.json +++ b/internal/validator/testdata/bad-crs/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns 
must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-edges/expected.json b/internal/validator/testdata/bad-edges/expected.json index cc73c66..452a4f9 100644 --- a/internal/validator/testdata/bad-edges/expected.json +++ b/internal/validator/testdata/bad-edges/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-encoding/expected.json b/internal/validator/testdata/bad-encoding/expected.json index a53e60e..63fc1f6 100644 --- a/internal/validator/testdata/bad-encoding/expected.json +++ b/internal/validator/testdata/bad-encoding/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-epoch/expected.json b/internal/validator/testdata/bad-epoch/expected.json index 3808ac8..4c7dd30 100644 --- a/internal/validator/testdata/bad-epoch/expected.json +++ b/internal/validator/testdata/bad-epoch/expected.json @@ -67,12 +67,12 @@ "message": "expected \"epoch\" for column \"geometry\" to be a number, got \"bogus\"" }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": false, "passed": false }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": false, "passed": false }, diff --git a/internal/validator/testdata/bad-geometry-types/expected.json b/internal/validator/testdata/bad-geometry-types/expected.json index 81c5534..9ee4895 100644 --- a/internal/validator/testdata/bad-geometry-types/expected.json +++ b/internal/validator/testdata/bad-geometry-types/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-metadata-type/expected.json b/internal/validator/testdata/bad-metadata-type/expected.json index c86f9cb..da63caf 100644 --- a/internal/validator/testdata/bad-metadata-type/expected.json +++ b/internal/validator/testdata/bad-metadata-type/expected.json @@ -67,12 +67,12 @@ "passed": false }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": false, "passed": false }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the 
BYTE_ARRAY parquet type", "run": false, "passed": false }, diff --git a/internal/validator/testdata/bad-orientation/expected.json b/internal/validator/testdata/bad-orientation/expected.json index d4d01c3..cfb6e7f 100644 --- a/internal/validator/testdata/bad-orientation/expected.json +++ b/internal/validator/testdata/bad-orientation/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/bad-primary-column/expected.json b/internal/validator/testdata/bad-primary-column/expected.json index 4da4c4d..bd1cc92 100644 --- a/internal/validator/testdata/bad-primary-column/expected.json +++ b/internal/validator/testdata/bad-primary-column/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/geometry-correctly-oriented/expected.json b/internal/validator/testdata/geometry-correctly-oriented/expected.json index 2b2da11..313c688 100644 --- a/internal/validator/testdata/geometry-correctly-oriented/expected.json +++ b/internal/validator/testdata/geometry-correctly-oriented/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/geometry-incorrectly-oriented/expected.json b/internal/validator/testdata/geometry-incorrectly-oriented/expected.json index 5cb14ec..c2312ba 100644 --- a/internal/validator/testdata/geometry-incorrectly-oriented/expected.json +++ b/internal/validator/testdata/geometry-incorrectly-oriented/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/geometry-inside-antimeridian-spanning-bbox/expected.json b/internal/validator/testdata/geometry-inside-antimeridian-spanning-bbox/expected.json index 2b2da11..313c688 100644 --- a/internal/validator/testdata/geometry-inside-antimeridian-spanning-bbox/expected.json +++ b/internal/validator/testdata/geometry-inside-antimeridian-spanning-bbox/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true 
}, diff --git a/internal/validator/testdata/geometry-outside-antimeridian-spanning-bbox/expected.json b/internal/validator/testdata/geometry-outside-antimeridian-spanning-bbox/expected.json index 52ed2c2..f25e88e 100644 --- a/internal/validator/testdata/geometry-outside-antimeridian-spanning-bbox/expected.json +++ b/internal/validator/testdata/geometry-outside-antimeridian-spanning-bbox/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/geometry-outside-bbox/expected.json b/internal/validator/testdata/geometry-outside-bbox/expected.json index 0335eaa..a36adbb 100644 --- a/internal/validator/testdata/geometry-outside-bbox/expected.json +++ b/internal/validator/testdata/geometry-outside-bbox/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/geometry-type-not-in-list/expected.json b/internal/validator/testdata/geometry-type-not-in-list/expected.json index 845ef35..4de556d 100644 --- a/internal/validator/testdata/geometry-type-not-in-list/expected.json +++ b/internal/validator/testdata/geometry-type-not-in-list/expected.json @@ -66,12 +66,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/missing-columns/expected.json b/internal/validator/testdata/missing-columns/expected.json index fd0bf60..21b3d07 100644 --- a/internal/validator/testdata/missing-columns/expected.json +++ b/internal/validator/testdata/missing-columns/expected.json @@ -67,12 +67,12 @@ "passed": false }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": false, "passed": false }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": false, "passed": false }, diff --git a/internal/validator/testdata/missing-encoding/expected.json b/internal/validator/testdata/missing-encoding/expected.json index 207a565..4e1d107 100644 --- a/internal/validator/testdata/missing-encoding/expected.json +++ b/internal/validator/testdata/missing-encoding/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/missing-geometry-types/expected.json 
b/internal/validator/testdata/missing-geometry-types/expected.json index 54c6113..9418986 100644 --- a/internal/validator/testdata/missing-geometry-types/expected.json +++ b/internal/validator/testdata/missing-geometry-types/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/missing-primary-column/expected.json b/internal/validator/testdata/missing-primary-column/expected.json index 5edb22d..152039d 100644 --- a/internal/validator/testdata/missing-primary-column/expected.json +++ b/internal/validator/testdata/missing-primary-column/expected.json @@ -68,12 +68,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/testdata/missing-version/expected.json b/internal/validator/testdata/missing-version/expected.json index e2e6d9a..07a85b9 100644 --- a/internal/validator/testdata/missing-version/expected.json +++ b/internal/validator/testdata/missing-version/expected.json @@ -67,12 +67,12 @@ "passed": true }, { - "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "title": "geometry columns must not be grouped", "run": true, "passed": true }, { - "title": "geometry columns must not be grouped", + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", "run": true, "passed": true }, diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 19a9d00..8787e55 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -22,9 +22,12 @@ import ( "io" "os" + "github.com/apache/arrow/go/v14/arrow/array" + "github.com/apache/arrow/go/v14/parquet/file" + "github.com/paulmach/orb" + "github.com/planetlabs/gpq/internal/geo" "github.com/planetlabs/gpq/internal/geoparquet" _ "github.com/santhosh-tekuri/jsonschema/v5/httploader" - "github.com/segmentio/parquet-go" ) type Validator struct { @@ -47,8 +50,8 @@ func MetadataOnlyRules() []Rule { OptionalEdges(), OptionalBbox(), OptionalEpoch(), - GeometryDataType(), GeometryUngrouped(), + GeometryDataType(), GeometryRepetition(), } } @@ -91,27 +94,23 @@ type Check struct { // Validate opens and validates a GeoParquet file. 
func (v *Validator) Validate(ctx context.Context, resource string) (*Report, error) { - stat, statError := os.Stat(resource) - if statError != nil { - return nil, fmt.Errorf("failed to get size of %q: %w", resource, statError) - } - - input, readErr := os.Open(resource) - if readErr != nil { - return nil, fmt.Errorf("failed to read from %q: %w", resource, readErr) + input, inputErr := os.Open(resource) + if inputErr != nil { + return nil, fmt.Errorf("failed to read from %q: %w", resource, inputErr) } defer input.Close() - file, fileErr := parquet.OpenFile(input, stat.Size()) - if fileErr != nil { - return nil, fileErr + reader, readerErr := file.NewParquetReader(input) + if readerErr != nil { + return nil, fmt.Errorf("failed to create parquet reader from %q: %w", resource, readerErr) } + defer reader.Close() - return v.Report(ctx, file) + return v.Report(ctx, reader) } // Report generates a validation report for a GeoParquet file. -func (v *Validator) Report(ctx context.Context, file *parquet.File) (*Report, error) { +func (v *Validator) Report(ctx context.Context, file *file.Reader) (*Report, error) { checks := make([]*Check, len(v.rules)) for i, rule := range v.rules { checks[i] = &Check{ @@ -127,7 +126,7 @@ func (v *Validator) Report(ctx context.Context, file *parquet.File) (*Report, er } // run all metadata rules - metadataValue, metadataErr := geoparquet.GetMetadataValue(file) + metadataValue, metadataErr := geoparquet.GetMetadataValue(file.MetaData().KeyValueMetadata()) if metadataErr != nil { return nil, metadataErr } @@ -161,7 +160,7 @@ func (v *Validator) Report(ctx context.Context, file *parquet.File) (*Report, er } // run all rules that need the file and parsed metadata - metadata, err := geoparquet.GetMetadata(file) + metadata, err := geoparquet.GetMetadata(file.MetaData().KeyValueMetadata()) if err != nil { return nil, err } @@ -176,12 +175,19 @@ func (v *Validator) Report(ctx context.Context, file *parquet.File) (*Report, er } // run all the data scanning rules - rowReader := geoparquet.NewRowReader(file) + recordReader, rrErr := geoparquet.NewRecordReader(&geoparquet.ReaderConfig{ + File: file, + Context: ctx, + }) + if rrErr != nil { + return nil, rrErr + } + defer recordReader.Close() - encodedGeometryRules := []*RowRule[EncodedGeometryMap]{} + encodedGeometryRules := []*ColumnValueRule[any]{} encodedGeometryChecks := []*Check{} for i, r := range v.rules { - rule, ok := r.(*RowRule[EncodedGeometryMap]) + rule, ok := r.(*ColumnValueRule[any]) if ok { rule.Init(info) encodedGeometryRules = append(encodedGeometryRules, rule) @@ -189,10 +195,10 @@ func (v *Validator) Report(ctx context.Context, file *parquet.File) (*Report, er } } - decodedGeometryRules := []*RowRule[DecodedGeometryMap]{} + decodedGeometryRules := []*ColumnValueRule[orb.Geometry]{} decodedGeometryChecks := []*Check{} for i, r := range v.rules { - rule, ok := r.(*RowRule[DecodedGeometryMap]) + rule, ok := r.(*ColumnValueRule[orb.Geometry]) if ok { rule.Init(info) decodedGeometryRules = append(decodedGeometryRules, rule) @@ -200,56 +206,54 @@ func (v *Validator) Report(ctx context.Context, file *parquet.File) (*Report, er } } - schema := file.Schema() for { - row, readErr := rowReader.Next() - if readErr == io.EOF { + record, recordErr := recordReader.Read() + if recordErr == io.EOF { break } - if readErr != nil { - return nil, fmt.Errorf("failed to read row: %w", readErr) + if recordErr != nil { + return nil, fmt.Errorf("failed to read record: %w", recordErr) } + defer record.Release() - properties := 
map[string]any{} - if err := schema.Reconstruct(&properties, row); err != nil { - return nil, fmt.Errorf("failed to reconstruct row: %w", err) - } + schema := record.Schema() - encodedGeometryMap := EncodedGeometryMap{} - for name := range metadata.Columns { - value, ok := properties[name] - if !ok { - return nil, fmt.Errorf("missing column %q", name) - } - encodedGeometryMap[name] = value - } + arr := array.RecordToStructArray(record) + defer arr.Release() - for i, rule := range encodedGeometryRules { - check := encodedGeometryChecks[i] - if err := rule.Row(encodedGeometryMap); errors.Is(err, ErrFatal) { - check.Message = err.Error() - check.Run = true - return report, nil + for colNum := 0; colNum < arr.NumField(); colNum += 1 { + field := schema.Field(colNum) + geomColumn := metadata.Columns[field.Name] + if geomColumn == nil { + continue } - } - - decodedGeometryMap := DecodedGeometryMap{} - for name, value := range encodedGeometryMap { - decoded, _, err := geoparquet.Geometry(value, name, metadata, schema) - if err != nil { - return nil, fmt.Errorf("failed to decode geometry: %w", err) + values := arr.Field(colNum) + for rowNum := 0; rowNum < arr.Len(); rowNum += 1 { + value := values.GetOneForMarshal(rowNum) + for i, rule := range encodedGeometryRules { + check := encodedGeometryChecks[i] + if err := rule.Value(field.Name, value); errors.Is(err, ErrFatal) { + check.Message = err.Error() + check.Run = true + return report, nil + } + } + + geometry, err := geo.DecodeGeometry(value, geomColumn.Encoding) + if err != nil { + return nil, fmt.Errorf("failed to decode geometry for %q: %w", field.Name, err) + } + for i, rule := range decodedGeometryRules { + check := decodedGeometryChecks[i] + if err := rule.Value(field.Name, geometry.Geometry()); errors.Is(err, ErrFatal) { + check.Message = err.Error() + check.Run = true + return report, nil + } + } } - decodedGeometryMap[name] = decoded } - for i, rule := range decodedGeometryRules { - check := decodedGeometryChecks[i] - if err := rule.Row(decodedGeometryMap); errors.Is(err, ErrFatal) { - check.Message = err.Error() - check.Run = true - return report, nil - } - } } for i, rule := range encodedGeometryRules { diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index 8b33886..edbbbae 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -17,6 +17,7 @@ package validator_test import ( "bytes" "context" + "encoding/json" "fmt" "io" "net/url" @@ -25,11 +26,13 @@ import ( "strings" "testing" + "github.com/apache/arrow/go/v14/parquet" + "github.com/apache/arrow/go/v14/parquet/file" "github.com/planetlabs/gpq/internal/geojson" + "github.com/planetlabs/gpq/internal/geoparquet" + "github.com/planetlabs/gpq/internal/pqutil" "github.com/planetlabs/gpq/internal/validator" "github.com/santhosh-tekuri/jsonschema/v5" - "github.com/segmentio/encoding/json" - "github.com/segmentio/parquet-go" "github.com/stretchr/testify/suite" ) @@ -76,20 +79,35 @@ func (s *Suite) readSpec(name string) *Spec { return input } -func (s *Suite) generateGeoParquet(name string) *parquet.File { +func (s *Suite) copyWithMetadata(input parquet.ReaderAtSeeker, output io.Writer, metadata string) { + config := &pqutil.TransformConfig{ + Reader: input, + Writer: output, + BeforeClose: func(fileWriter *file.Writer) error { + return fileWriter.AppendKeyValueMetadata(geoparquet.MetadataKey, metadata) + }, + } + s.Require().NoError(pqutil.TransformByColumn(config)) +} + +func (s *Suite) generateGeoParquet(name 
string) *file.Reader { spec := s.readSpec(name) - output := &bytes.Buffer{} + initialOutput := &bytes.Buffer{} options := &geojson.ConvertOptions{ Metadata: string(spec.Metadata), } - s.Require().NoError(geojson.ToParquet(bytes.NewReader(spec.Data), output, options)) + s.Require().NoError(geojson.ToParquet(bytes.NewReader(spec.Data), initialOutput, options)) + + input := bytes.NewReader(initialOutput.Bytes()) + output := &bytes.Buffer{} + s.copyWithMetadata(input, output, string(spec.Metadata)) - file, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len())) + fileReader, err := file.NewParquetReader(bytes.NewReader(output.Bytes())) s.Require().NoError(err) - return file + return fileReader } func (s *Suite) assertExpectedReport(name string, report *validator.Report) {