From eb47992868364db0487d30e20f56646e94e76530 Mon Sep 17 00:00:00 2001 From: Robin Tang Date: Thu, 19 Sep 2024 13:51:35 -0700 Subject: [PATCH] Simplifying Merge Columns (#917) --- lib/optimization/table_data.go | 30 ++-- ...st.go => table_data_merge_columns_test.go} | 134 ++++++++---------- lib/typing/ext/time.go | 2 + 3 files changed, 71 insertions(+), 95 deletions(-) rename lib/optimization/{event_update_test.go => table_data_merge_columns_test.go} (74%) diff --git a/lib/optimization/table_data.go b/lib/optimization/table_data.go index d0fcc5e6c..4d6cc3d2e 100644 --- a/lib/optimization/table_data.go +++ b/lib/optimization/table_data.go @@ -2,7 +2,6 @@ package optimization import ( "fmt" - "log/slog" "strings" "time" @@ -273,34 +272,27 @@ func (t *TableData) MergeColumnsFromDestination(destCols ...columns.Column) erro } if found { - // TODO: Simplify this whole logic - // If the inMemoryColumn is decimal and foundColumn is integer, don't copy it over. - // This is because parsing NUMERIC(...) will return an INTEGER if there's no decimal point. - // However, this will wipe the precision unit from the INTEGER which may cause integer overflow. - shouldSkip := inMemoryCol.KindDetails.Kind == typing.EDecimal.Kind && foundColumn.KindDetails.Kind == typing.Integer.Kind - if shouldSkip { - slog.Warn("Skipping column", slog.String("column", inMemoryCol.Name()), slog.String("inMemoryKind", inMemoryCol.KindDetails.Kind), slog.String("foundKind", foundColumn.KindDetails.Kind)) - } else { - // We should take `kindDetails.kind` and `backfilled` from foundCol - // We are not taking primaryKey and defaultValue because DWH does not have this information. - // Note: If our in-memory column is `Invalid`, it would get skipped during merge. However, if the column exists in - // the destination, we'll copy the type over. This is to make sure we don't miss batch updates where the whole column in the batch is NULL. - inMemoryCol.KindDetails.Kind = foundColumn.KindDetails.Kind - if foundColumn.KindDetails.OptionalStringPrecision != nil { - inMemoryCol.KindDetails.OptionalStringPrecision = foundColumn.KindDetails.OptionalStringPrecision - } - } + inMemoryCol.KindDetails.Kind = foundColumn.KindDetails.Kind + // Copy over backfilled inMemoryCol.SetBackfilled(foundColumn.Backfilled()) + + // Copy over string precision, if it exists + if foundColumn.KindDetails.OptionalStringPrecision != nil { + inMemoryCol.KindDetails.OptionalStringPrecision = foundColumn.KindDetails.OptionalStringPrecision + } + + // Copy over the time details if foundColumn.KindDetails.ExtendedTimeDetails != nil { if inMemoryCol.KindDetails.ExtendedTimeDetails == nil { inMemoryCol.KindDetails.ExtendedTimeDetails = &ext.NestedKind{} } - // Don't have tcKind.ExtendedTimeDetails update the format since the DWH will not have that. + // Just copy over the type since the format wouldn't be present in the destination inMemoryCol.KindDetails.ExtendedTimeDetails.Type = foundColumn.KindDetails.ExtendedTimeDetails.Type } + // Copy over the decimal details if foundColumn.KindDetails.ExtendedDecimalDetails != nil && inMemoryCol.KindDetails.ExtendedDecimalDetails == nil { inMemoryCol.KindDetails.ExtendedDecimalDetails = foundColumn.KindDetails.ExtendedDecimalDetails } diff --git a/lib/optimization/event_update_test.go b/lib/optimization/table_data_merge_columns_test.go similarity index 74% rename from lib/optimization/event_update_test.go rename to lib/optimization/table_data_merge_columns_test.go index 4fae39c09..3a1791922 100644 --- a/lib/optimization/event_update_test.go +++ b/lib/optimization/table_data_merge_columns_test.go @@ -10,104 +10,91 @@ import ( "github.com/stretchr/testify/assert" ) -const strCol = "string" - func TestTableData_UpdateInMemoryColumnsFromDestination(t *testing.T) { + const strCol = "string" + tableDataCols := &columns.Columns{} + tableData := &TableData{inMemoryColumns: tableDataCols} { - tableDataCols := &columns.Columns{} - tableData := &TableData{ - inMemoryColumns: tableDataCols, - } - + // Trying to merge an invalid destination column tableData.AddInMemoryCol(columns.NewColumn("foo", typing.String)) invalidCol := columns.NewColumn("foo", typing.Invalid) assert.ErrorContains(t, tableData.MergeColumnsFromDestination(invalidCol), `column "foo" is invalid`) } { - // If the in-memory column is a string and the destination column is Date - // We should mark the in-memory column as date and try to parse it accordingly. - tableDataCols := &columns.Columns{} - tableData := &TableData{ - inMemoryColumns: tableDataCols, - } - + // In-memory column is a string and the destination column is a Date tableData.AddInMemoryCol(columns.NewColumn("foo", typing.String)) extTime := typing.ETime - extTime.ExtendedTimeDetails = &ext.NestedKind{ - Type: ext.DateKindType, - } - + extTime.ExtendedTimeDetails = &ext.Date tsCol := columns.NewColumn("foo", extTime) assert.NoError(t, tableData.MergeColumnsFromDestination(tsCol)) - col, isOk := tableData.inMemoryColumns.GetColumn("foo") + updatedColumn, isOk := tableData.inMemoryColumns.GetColumn("foo") assert.True(t, isOk) - assert.Equal(t, typing.ETime.Kind, col.KindDetails.Kind) - assert.Equal(t, ext.DateKindType, col.KindDetails.ExtendedTimeDetails.Type) - assert.Equal(t, extTime.ExtendedTimeDetails, col.KindDetails.ExtendedTimeDetails) + assert.Equal(t, typing.ETime.Kind, updatedColumn.KindDetails.Kind) + assert.Equal(t, ext.DateKindType, updatedColumn.KindDetails.ExtendedTimeDetails.Type) + // Format is not copied over. + assert.Equal(t, "", updatedColumn.KindDetails.ExtendedTimeDetails.Format) } { - tableDataCols := &columns.Columns{} - tableData := &TableData{ - inMemoryColumns: tableDataCols, - } - - tableDataCols.AddColumn(columns.NewColumn("name", typing.String)) - tableDataCols.AddColumn(columns.NewColumn("bool_backfill", typing.Boolean)) - tableDataCols.AddColumn(columns.NewColumn("prev_invalid", typing.Invalid)) + // In-memory column is NUMERIC and destination column is an INTEGER tableDataCols.AddColumn(columns.NewColumn("numeric_test", typing.EDecimal)) + assert.NoError(t, tableData.MergeColumnsFromDestination(columns.NewColumn("numeric_test", typing.Integer))) - // Casting these as STRING so tableColumn via this f(x) will set it correctly. - tableDataCols.AddColumn(columns.NewColumn("ext_date", typing.String)) - tableDataCols.AddColumn(columns.NewColumn("ext_time", typing.String)) - tableDataCols.AddColumn(columns.NewColumn("ext_datetime", typing.String)) - tableDataCols.AddColumn(columns.NewColumn("ext_dec", typing.String)) - - extDecimalType := typing.NewDecimalDetailsFromTemplate(typing.EDecimal, decimal.NewDetails(22, 2)) - tableDataCols.AddColumn(columns.NewColumn("ext_dec_filled", extDecimalType)) + numericCol, isOk := tableData.inMemoryColumns.GetColumn("numeric_test") + assert.True(t, isOk) + assert.Equal(t, typing.Integer.Kind, numericCol.KindDetails.Kind) + } + { + // Boolean column that has been backfilled + tableDataCols.AddColumn(columns.NewColumn("bool_backfill", typing.Boolean)) + backfilledCol := columns.NewColumn("bool_backfill", typing.Boolean) + backfilledCol.SetBackfilled(true) - tableDataCols.AddColumn(columns.NewColumn(strCol, typing.String)) + // Backfill was not set + column, isOk := tableData.inMemoryColumns.GetColumn("bool_backfill") + assert.True(t, isOk) + assert.False(t, column.Backfilled()) + assert.NoError(t, tableData.MergeColumnsFromDestination(backfilledCol)) + // Backfill is set after merge. + column, isOk = tableData.inMemoryColumns.GetColumn("bool_backfill") + assert.True(t, isOk) + assert.True(t, column.Backfilled()) + } + { + // Non-existent columns should not be copied over. nonExistentTableCols := []string{"dusty", "the", "mini", "aussie"} var nonExistentCols []columns.Column for _, nonExistentTableCol := range nonExistentTableCols { nonExistentCols = append(nonExistentCols, columns.NewColumn(nonExistentTableCol, typing.String)) } - // Testing to make sure we don't copy over non-existent columns assert.NoError(t, tableData.MergeColumnsFromDestination(nonExistentCols...)) for _, nonExistentTableCol := range nonExistentTableCols { _, isOk := tableData.inMemoryColumns.GetColumn(nonExistentTableCol) assert.False(t, isOk, nonExistentTableCol) } + } + { + // In-memory column was invalid, but the destination column is valid + tableDataCols.AddColumn(columns.NewColumn("invalid_test", typing.Invalid)) + assert.NoError(t, tableData.MergeColumnsFromDestination(columns.NewColumn("invalid_test", typing.String))) - // Making sure it's still numeric - assert.NoError(t, tableData.MergeColumnsFromDestination(columns.NewColumn("numeric_test", typing.Integer))) - numericCol, isOk := tableData.inMemoryColumns.GetColumn("numeric_test") - assert.True(t, isOk) - assert.Equal(t, typing.EDecimal.Kind, numericCol.KindDetails.Kind, "numeric_test") - - // Testing to make sure we're copying the kindDetails over. - assert.NoError(t, tableData.MergeColumnsFromDestination(columns.NewColumn("prev_invalid", typing.String))) - prevInvalidCol, isOk := tableData.inMemoryColumns.GetColumn("prev_invalid") + invalidCol, isOk := tableData.inMemoryColumns.GetColumn("invalid_test") assert.True(t, isOk) - assert.Equal(t, typing.String, prevInvalidCol.KindDetails) + assert.Equal(t, typing.String.Kind, invalidCol.KindDetails.Kind) + } + { + // Casting these as STRING so tableColumn via this f(x) will set it correctly. + tableDataCols.AddColumn(columns.NewColumn("ext_date", typing.String)) + tableDataCols.AddColumn(columns.NewColumn("ext_time", typing.String)) + tableDataCols.AddColumn(columns.NewColumn("ext_datetime", typing.String)) + tableDataCols.AddColumn(columns.NewColumn("ext_dec", typing.String)) - // Testing backfill - for _, inMemoryCol := range tableData.inMemoryColumns.GetColumns() { - assert.False(t, inMemoryCol.Backfilled(), inMemoryCol.Name()) - } - backfilledCol := columns.NewColumn("bool_backfill", typing.Boolean) - backfilledCol.SetBackfilled(true) - assert.NoError(t, tableData.MergeColumnsFromDestination(backfilledCol)) - for _, inMemoryCol := range tableData.inMemoryColumns.GetColumns() { - if inMemoryCol.Name() == backfilledCol.Name() { - assert.True(t, inMemoryCol.Backfilled(), inMemoryCol.Name()) - } else { - assert.False(t, inMemoryCol.Backfilled(), inMemoryCol.Name()) - } - } + extDecimalType := typing.NewDecimalDetailsFromTemplate(typing.EDecimal, decimal.NewDetails(22, 2)) + tableDataCols.AddColumn(columns.NewColumn("ext_dec_filled", extDecimalType)) + tableDataCols.AddColumn(columns.NewColumn(strCol, typing.String)) // Testing extTimeDetails for _, extTimeDetailsCol := range []string{"ext_date", "ext_time", "ext_datetime"} { @@ -169,20 +156,15 @@ func TestTableData_UpdateInMemoryColumnsFromDestination(t *testing.T) { assert.Equal(t, int32(2), extDecColFilled.KindDetails.ExtendedDecimalDetails.Scale()) } { - tableDataCols := &columns.Columns{} - tableData := &TableData{ - inMemoryColumns: tableDataCols, - } - + // String (precision being copied over) tableDataCols.AddColumn(columns.NewColumn(strCol, typing.String)) + assert.NoError(t, tableData.MergeColumnsFromDestination(columns.NewColumn(strCol, + typing.KindDetails{ + Kind: typing.String.Kind, + OptionalStringPrecision: typing.ToPtr(int32(123)), + }), + )) - // Testing string precision - stringKindWithPrecision := typing.KindDetails{ - Kind: typing.String.Kind, - OptionalStringPrecision: typing.ToPtr(int32(123)), - } - - assert.NoError(t, tableData.MergeColumnsFromDestination(columns.NewColumn(strCol, stringKindWithPrecision))) foundStrCol, isOk := tableData.inMemoryColumns.GetColumn(strCol) assert.True(t, isOk) assert.Equal(t, typing.String.Kind, foundStrCol.KindDetails.Kind) diff --git a/lib/typing/ext/time.go b/lib/typing/ext/time.go index 3294d99e2..2b05fc462 100644 --- a/lib/typing/ext/time.go +++ b/lib/typing/ext/time.go @@ -5,6 +5,8 @@ import ( "time" ) +// TODO: This package should have a concept of default formats for each type. + type ExtendedTimeKindType string const (