
Feat: data list metadata #2529

Merged (8 commits, Nov 22, 2024)
Changes from 4 commits
17 changes: 17 additions & 0 deletions packages/data-models/flowTypes.ts
@@ -74,9 +74,26 @@ export namespace FlowTypes {
flow_type: "asset_pack";
rows: Data_listRow<IAssetEntry>[];
}

/**
* Data types supported within data_lists.
* These are a subset of the types defined by the JSON Schema standard:
* https://json-schema.org/draft/2020-12/json-schema-core#name-instance-data-model
*/
export type Data_listColumnType = "boolean" | "number" | "object" | "string";

export interface Data_listColumnMetadata {
/** Column data type. Default "string" when not defined */
type?: Data_listColumnType;
}

export interface Data_list extends FlowTypeWithData {
flow_type: "data_list";
rows: Data_listRow[];
/** Hashmap of any additional column metadata (omitted if all columns use the default "string" type) */
_metadata?: {
[column_name: string]: Data_listColumnMetadata;
};
}
export interface DataPipeFlow extends FlowTypeWithData {
flow_type: "data_pipe";
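As an illustrative sketch (not part of the diff above; names and values are hypothetical), a parsed data_list containing non-string columns would carry the new _metadata hashmap alongside its rows:

const exampleList: FlowTypes.Data_list = {
  flow_type: "data_list",
  flow_name: "example_list",
  rows: [{ id: "id_1", label: "hello", priority: 2, enabled: true }],
  // only non-string columns are recorded; "label" is omitted as it uses the default "string" type
  _metadata: {
    priority: { type: "number" },
    enabled: { type: "boolean" },
  },
};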
@@ -4,7 +4,7 @@ import { IConverterPaths, IFlowHashmapByType, IParsedWorkbookData } from "../../
import { arrayToHashmap, groupJsonByKey, IContentsEntry } from "../../utils";
import BaseProcessor from "../base";

const cacheVersion = 20241024.0;
const cacheVersion = 20241114.2;

export class FlowParserProcessor extends BaseProcessor<FlowTypes.FlowTypeWithData> {
public parsers: { [flowType in FlowTypes.FlowType]: Parsers.DefaultParser } = {
@@ -1,10 +1,97 @@
import { FlowTypes } from "data-models";
import { DataListParser } from ".";
import { TEST_DATA_PATHS } from "../../../../../../../test/helpers/utils";
import { TEST_DATA_PATHS, useMockLogger } from "../../../../../../../test/helpers/utils";
import { FlowParserProcessor } from "../flowParser";

const testFlow = {
const MOCK_DATA_LIST = (): FlowTypes.Data_list => ({
flow_type: "data_list",
flow_name: "test_data_list",
flow_name: "mock_data_list",
rows: [
{
id: "id_1",
number: 1,
text: "hello",
boolean: true,
nested: { string: "hello" },
},
{
id: "id_1",
number: 1,
text: "hello",
boolean: true,
nested: { string: "hello" },
additional: false,
},
],
_xlsxPath: "/mock/data/path",
});

/***********************************************************************
* Basic
**********************************************************************/

/** yarn workspace scripts test -t data_list.parser.spec.ts **/
describe("data_list Parser Metadata", () => {
let parser: DataListParser;

beforeAll(() => {
parser = new DataListParser({ processedFlowHashmap: {} } as any);
});
it("Infers column metadata from data (for non-string columns)", () => {
const { _metadata } = parser.postProcessFlow(MOCK_DATA_LIST());
expect(_metadata).toEqual({
number: { type: "number" },
boolean: { type: "boolean" },
nested: { type: "object" },
// data missing or undefined in the first row is identified from subsequent rows
additional: { type: "boolean" },
});
});
it("Assigns column metadata from metadata row", () => {
const flowWithMetadataRow = {
...MOCK_DATA_LIST(),
rows: [
{ id: "@metadata", number: "type: number", text: "type: string" },
{ id: "id_1", number: undefined, text: undefined },
],
};
const { _metadata } = parser.postProcessFlow(flowWithMetadataRow);
expect(_metadata).toEqual({
number: { type: "number" },
});
});
it("Omits metadata when all columns are strings", () => {
const flowWithStringColumns = {
...MOCK_DATA_LIST(),
rows: [{ id: "id_1", text: "hello" }],
};
const { _metadata } = parser.postProcessFlow(flowWithStringColumns);
expect(_metadata).toBeUndefined();
});

// eslint-disable-next-line jest/no-commented-out-tests
// it("QC metadata", () => {
// const loggerSpy = useMockLogger(false);
// const flowWithMetadataRow = {
// ...MOCK_DATA_LIST(),
// rows: [{ id: "id_1", number: 1, text: undefined }],
// };
// parser.postProcessFlow(flowWithMetadataRow);
// expect(loggerSpy.warning).toHaveBeenCalledTimes(1);
// expect(loggerSpy.warning).toHaveBeenCalledWith({
// msg1: "[mock_data_list] Data List validation has (1) warnings",
// msg2: "/mock/data/path",
// });
// });
});

/***********************************************************************
* Advanced use cases
**********************************************************************/

const MOCK_DATA_LIST_FLOW_ADVANCED = (): FlowTypes.Data_list => ({
flow_type: "data_list",
flow_name: "mock_data_list_advanced",
rows: [
{
id: "test_1",
@@ -14,18 +101,18 @@ const testFlow = {
"nested.test_2": "two",
},
],
};
});

/** yarn workspace scripts test -t data_list.parser.spec.ts **/
describe("data_list Parser (single)", () => {
let outputRows: any[];
let parser: DataListParser;

beforeAll(() => {
const parser = new DataListParser({ processedFlowHashmap: {} } as any);
const output = parser.run(testFlow as any);
outputRows = output.rows;
parser = new DataListParser({ processedFlowHashmap: {} } as any);
});

it("Extracts condition_list", async () => {
const { test_condition_list } = outputRows[0];
const { rows } = parser.run(MOCK_DATA_LIST_FLOW_ADVANCED() as any);
const { test_condition_list } = rows[0];
expect(test_condition_list).toEqual([
{
condition_type: "calc",
@@ -35,12 +122,14 @@ describe("data_list Parser (single)", () => {
]);
});
it("Extracts nested properties", async () => {
const { nested } = outputRows[0];
const { rows } = parser.run(MOCK_DATA_LIST_FLOW_ADVANCED() as any);
const { nested } = rows[0];
expect(nested).toEqual({ test_1: 1, test_2: "two" });
});
// TODO - likely deprecated (no usage found in the codebase)
it("Extracts notification_schedule", async () => {
const { test_notification_schedule } = outputRows[0];
const { rows } = parser.run(MOCK_DATA_LIST_FLOW_ADVANCED() as any);
const { test_notification_schedule } = rows[0];
expect(test_notification_schedule).toEqual({ key_1: "value_1", key_2: "value_2" });
});
});
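For context, an illustrative sketch of the authoring convention exercised by the metadata-row test above (the row values here are hypothetical):

// A first row with id "@metadata" declares column types explicitly
const authoredRows = [
  { id: "@metadata", number: "type: number", text: "type: string" },
  { id: "id_1", number: "42", text: "hello" },
];
// After postProcessFlow the flow's _metadata would be { number: { type: "number" } };
// "text" is omitted because "string" is the default column type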
@@ -6,6 +6,7 @@ import {
setNestedProperty,
} from "../../../utils";
import { DefaultParser } from "./default.parser";
import { isEmptyObjectDeep, isObjectLiteral, Logger } from "shared";

export class DataListParser extends DefaultParser {
postProcessRow(row: FlowTypes.Data_listRow) {
@@ -33,8 +34,102 @@ export class DataListParser extends DefaultParser {
return row;
}

public postProcessFlows(flows: FlowTypes.FlowTypeWithData[]) {
public override postProcessFlow(flow: FlowTypes.Data_list) {
const metadata = this.getFlowMetadata(flow);
if (!isEmptyObjectDeep(metadata)) {
flow._metadata = metadata;
}
return flow;
}

public postProcessFlows(flows: FlowTypes.Data_list[]) {
const flowsWithOverrides = assignFlowOverrides(flows);
return flowsWithOverrides;
}

/** Assign column metadata from an @metadata row if provided, or infer it from data if not */
private getFlowMetadata(flow: FlowTypes.FlowTypeWithData) {
const [firstRow] = flow.rows;
const initialMetadata =
firstRow?.id === "@metadata"
? this.assignMetadataFromRow(firstRow)
: this.inferMetadataFromData(flow.rows);

const { metadata, warnings } = this.qualityCheckMetadata(initialMetadata);
if (warnings.length > 0) {
for (const warning of warnings) {
console.warn(warning);
}
Logger.warning({
msg1: `[${flow.flow_name}] Data List validation has (${warnings.length}) warnings`,
msg2: flow._xlsxPath,
});
}
return metadata;
}

/** Assign data_list metadata using a first `@metadata` row */
private assignMetadataFromRow(metadataRow: FlowTypes.Data_listRow) {
// do not extract metadata for id column
const { id, ...values } = metadataRow;
const metadata: FlowTypes.Data_list["_metadata"] = {};
for (const [field, metadataString] of Object.entries(values)) {
if (metadataString) {
metadata[field] = parseAppDataCollectionString(metadataString as string) as any;
}
}
return metadata;
}

/** Infer data_list metadata from data */
private inferMetadataFromData(rows: FlowTypes.Data_listRow[]) {
const metadata: FlowTypes.Data_list["_metadata"] = {};
for (const row of rows) {
// do not extract metadata for id column
const { id, ...values } = row;
for (const [field, value] of Object.entries(values)) {
// do not assign type to other metadata columns
if (!field.startsWith("_")) {
// only assign type if previously unassigned
if (!metadata[field]?.type) {
const inferredType = this.inferColumnDataType(value);
metadata[field] ??= {};
metadata[field].type = inferredType;
}
}
}
}
return metadata;
}

private inferColumnDataType(value: any): FlowTypes.Data_listColumnType | undefined {
if (value === undefined) return undefined;
if (typeof value === "string") return "string";
if (typeof value === "number") return "number";
if (typeof value === "boolean") return "boolean";
if (isObjectLiteral(value)) return "object";
return undefined;
}

private qualityCheckMetadata(metadata: FlowTypes.Data_list["_metadata"]) {
const warnings: string[] = [];
for (const [columnName, columnMetadata] of Object.entries(metadata)) {
// omit metadata for string or undefined types; the default "string" type will be used at runtime
if (columnMetadata.type === "string" || columnMetadata.type === undefined) {
delete metadata[columnName];
}

/**
* TODO - consider whether the check below is actually useful to have:
* ensure that all columns have a data type
*/
// if (columnMetadata.type === undefined) {
// metadata[columnName].type = "string";
// warnings.push(
// `"${columnName}" column cannot infer type (assume string)\nEither include an @metadata column or add example value to a row`
// );
// }
}
return { warnings, metadata };
}
}
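For illustration only, a hypothetical sketch of how a downstream consumer might use the parsed _metadata at runtime (the helper below is an assumption, not code introduced by this PR):

// Coerce a raw string cell value using column metadata, defaulting to "string"
function coerceColumnValue(value: string, type: FlowTypes.Data_listColumnType = "string") {
  switch (type) {
    case "number":
      return Number(value);
    case "boolean":
      return value === "true";
    case "object":
      return JSON.parse(value);
    default:
      return value; // "string" columns pass through unchanged
  }
}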
2 changes: 1 addition & 1 deletion packages/scripts/test/helpers/utils.ts
@@ -11,7 +11,7 @@ import { Logger } from "shared/src/utils/logging/console-logger";
* @param callOriginal specify whether to still call original Logger
* methods after spy intercept
*/
export function useMockLogger(callOriginal = true) {
export function useMockLogger(callOriginal = false) {
const error = jest.spyOn(Logger, "error");
const warning = jest.spyOn(Logger, "warning");
if (!callOriginal) {