From cfe06492d6f695623c1f1bf0c8935224d74b4887 Mon Sep 17 00:00:00 2001 From: rliget Date: Thu, 8 Feb 2024 14:23:59 +0100 Subject: [PATCH] initial fix of issue 67, need to test further --- .../functions/scalar/iterativelength.cpp | 34 ++- scripts/kuzu_shortest_path.py | 37 ++++ .../sql/path-finding/shortest_path_bound.test | 199 ++++++++++++++++++ 3 files changed, 267 insertions(+), 3 deletions(-) create mode 100644 scripts/kuzu_shortest_path.py create mode 100644 test/sql/path-finding/shortest_path_bound.test diff --git a/duckpgq/src/duckpgq/functions/scalar/iterativelength.cpp b/duckpgq/src/duckpgq/functions/scalar/iterativelength.cpp index 61746d41..533d9e2d 100644 --- a/duckpgq/src/duckpgq/functions/scalar/iterativelength.cpp +++ b/duckpgq/src/duckpgq/functions/scalar/iterativelength.cpp @@ -74,6 +74,16 @@ static void IterativeLengthFunction(DataChunk &args, ExpressionState &state, auto src_data = (int64_t *)vdata_src.data; auto dst_data = (int64_t *)vdata_dst.data; + // get lowerbound and upperbound + auto &lower_bound = args.data[4]; + auto &upper_bound = args.data[5]; + UnifiedVectorFormat vdata_lower_bound; + UnifiedVectorFormat vdata_upper_bound; + lower_bound.ToUnifiedFormat(args.size(), vdata_lower_bound); + upper_bound.ToUnifiedFormat(args.size(), vdata_upper_bound); + auto lower_bound_data = (int64_t *)vdata_lower_bound.data; + auto upper_bound_data = (int64_t *)vdata_upper_bound.data; + ValidityMask &result_validity = FlatVector::Validity(result); // create result vector @@ -115,6 +125,7 @@ static void IterativeLengthFunction(DataChunk &args, ExpressionState &state, result_data[search_num] = (uint64_t)0; // path of length 0 does not require a search } else { + seen[src_data[src_pos]][lane] = true; visit1[src_data[src_pos]][lane] = true; lane_to_num[lane] = search_num; // active lane active++; @@ -134,9 +145,25 @@ static void IterativeLengthFunction(DataChunk &args, ExpressionState &state, int64_t search_num = lane_to_num[lane]; if (search_num >= 0) { 
// active lane int64_t dst_pos = vdata_dst.sel->get_index(search_num); - if (seen[dst_data[dst_pos]][lane]) { - result_data[search_num] = - iter; /* found at iter => iter = path length */ + if (seen[dst_data[dst_pos]][lane]){ + + // check if the path length is within bounds + // bound vector is either a constant or a flat vector + if (lower_bound.GetVectorType() == VectorType::CONSTANT_VECTOR ? + iter < lower_bound_data[0] : iter < lower_bound_data[dst_pos]) { + // when the destination is reached too early, treat it as not yet visited + // so that it looks as if the graph does not contain that vertex + seen[dst_data[dst_pos]][lane] = false; + (iter & 1) ? visit2[dst_data[dst_pos]][lane] = false + : visit1[dst_data[dst_pos]][lane] = false; + continue; + } else if (upper_bound.GetVectorType() == VectorType::CONSTANT_VECTOR ? + iter > upper_bound_data[0] : iter > upper_bound_data[dst_pos]) { + result_data[search_num] = (int64_t)-1; /* no path */ + } else { + result_data[search_num] = + iter; /* found at iter => iter = path length */ + } lane_to_num[lane] = -1; // mark inactive active--; } @@ -160,6 +187,7 @@ static void IterativeLengthFunction(DataChunk &args, ExpressionState &state, CreateScalarFunctionInfo DuckPGQFunctions::GetIterativeLengthFunction() { auto fun = ScalarFunction("iterativelength", {LogicalType::INTEGER, LogicalType::BIGINT, + LogicalType::BIGINT, LogicalType::BIGINT, LogicalType::BIGINT, LogicalType::BIGINT}, LogicalType::BIGINT, IterativeLengthFunction, IterativeLengthFunctionData::IterativeLengthBind); diff --git a/scripts/kuzu_shortest_path.py b/scripts/kuzu_shortest_path.py new file mode 100644 index 00000000..7cf2bb89 --- /dev/null +++ b/scripts/kuzu_shortest_path.py @@ -0,0 +1,37 @@ +import kuzu +import pandas as pd + +db = kuzu.Database('./test') +conn = kuzu.Connection(db) + +# Drop the table if it exists: +try: + conn.execute("DROP TABLE knows") +except: + pass +try: + conn.execute("DROP TABLE Person") +except: + pass + +# Define the schema: 
+conn.execute("CREATE NODE TABLE Person (creationDate TIMESTAMP, id INT64, firstName STRING, lastName STRING, gender STRING, birthday DATE, locationIP STRING, browserUsed STRING, LocationCityId INT64, speaks STRING, email STRING, PRIMARY KEY (id))") +conn.execute("CREATE REL TABLE knows (FROM Person TO Person)") + +# Load the data: +conn.execute("Copy Person FROM './test/person.csv'") +conn.execute("Copy knows FROM './test/person_knows_person.csv'") + +# Calculate the shortest path between two people with bounded distance: +MIN_DISTANCE = 0 +MAX_DISTANCE = 30 +results = pd.DataFrame() +for low in range(MIN_DISTANCE, MAX_DISTANCE + 1): + for high in range(low, MAX_DISTANCE + 1): + result = conn.execute("MATCH (a:Person)-[e:knows*%d..%d]->(b:Person) RETURN a.id, b.id, length(e) AS distance ORDER BY distance ASC" % (low, high)).get_as_df() + result = result.drop_duplicates(subset=['a.id', 'b.id'], keep='first') + result['min_distance'] = low + result['max_distance'] = high + results = pd.concat([results, result], ignore_index=True) + +results.to_csv('./test/shortest_length_kuzu.csv', index=False) \ No newline at end of file diff --git a/test/sql/path-finding/shortest_path_bound.test b/test/sql/path-finding/shortest_path_bound.test new file mode 100644 index 00000000..f0008d5d --- /dev/null +++ b/test/sql/path-finding/shortest_path_bound.test @@ -0,0 +1,199 @@ +# name: test/sql/path-finding/shortest_path_bound.test +# group: [sqlpgq] + +statement ok +pragma enable_verification + +require duckpgq + +# Graph to test regular shortest path bound +# (0) -> (1) +# ↓ ↑ +# (2) -> (3) + +statement ok +CREATE TABLE Point(id BIGINT); INSERT INTO Point VALUES (0), (1), (2), (3); + +statement ok +CREATE TABLE know(src BIGINT, dst BIGINT); INSERT INTO know VALUES (0, 1), (0, 2), (2, 3), (3, 1); + +statement ok +-CREATE PROPERTY GRAPH pg +VERTEX TABLES ( + Point PROPERTIES ( id ) LABEL Pnt + ) +EDGE TABLES ( + know SOURCE KEY ( src ) REFERENCES Point ( id ) + DESTINATION KEY ( dst ) 
REFERENCES Point ( id ) + LABEL Knows + ); + +query III +WITH cte1 AS ( + SELECT CREATE_CSR_EDGE( + 0, + (SELECT count(a.id) FROM Point a), + CAST ( + (SELECT sum(CREATE_CSR_VERTEX( + 0, + (SELECT count(a.id) FROM Point a), + sub.dense_id, + sub.cnt) + ) + FROM ( + SELECT a.rowid as dense_id, count(k.src) as cnt + FROM Point a + LEFT JOIN Know k ON k.src = a.id + GROUP BY a.rowid) sub + ) + AS BIGINT), + a.rowid, + c.rowid, + k.rowid) as temp + FROM Know k + JOIN Point a on a.id = k.src + JOIN Point c on c.id = k.dst +) SELECT a.id as srd_id, b.id as dst_id, iterativelength(0, (select count(*) from Point), a.rowid, b.rowid, 2, 3) as path_length + FROM Point a, Point b, (select count(cte1.temp) * 0 as temp from cte1) __x + WHERE a.id = 0 and __x.temp * 0 + iterativelength(0, (select count(*) from Point), a.rowid, b.rowid, 2, 3); +---- +0 1 3 +0 3 2 + +query III +WITH cte1 AS ( + SELECT CREATE_CSR_EDGE( + 0, + (SELECT count(a.id) FROM Point a), + CAST ( + (SELECT sum(CREATE_CSR_VERTEX( + 0, + (SELECT count(a.id) FROM Point a), + sub.dense_id, + sub.cnt) + ) + FROM ( + SELECT a.rowid as dense_id, count(k.src) as cnt + FROM Point a + LEFT JOIN Know k ON k.src = a.id + GROUP BY a.rowid) sub + ) + AS BIGINT), + a.rowid, + c.rowid, + k.rowid) as temp + FROM Know k + JOIN Point a on a.id = k.src + JOIN Point c on c.id = k.dst +) SELECT a.id, b.id, iterativelength(0, (select count(*) from Point), a.rowid, b.rowid, 1, 3) as path_length + FROM Point a, Point b, (select count(cte1.temp) * 0 as temp from cte1) __x + WHERE a.id = 0 and __x.temp * 0 + iterativelength(0, (select count(*) from Point), a.rowid, b.rowid, 1, 3); +---- +0 1 1 +0 2 1 +0 3 2 + +# Graph to test shortest path bound with a cycle +# (0) --> (1) <-> (2) +# 0 to 1 is 1 hop + +statement ok +CREATE TABLE Point2(id BIGINT); INSERT INTO Point2 VALUES (0), (1), (2); + +statement ok +CREATE TABLE know2(src BIGINT, dst BIGINT); INSERT INTO know2 VALUES (0, 1), (1, 2), (2, 1); + +statement ok +-CREATE PROPERTY GRAPH 
pg2 +VERTEX TABLES ( + Point2 PROPERTIES ( id ) LABEL Pnt2 + ) +EDGE TABLES ( + know2 SOURCE KEY ( src ) REFERENCES Point2 ( id ) + DESTINATION KEY ( dst ) REFERENCES Point2 ( id ) + LABEL Knows2 + ); + +query III +WITH cte1 AS ( + SELECT CREATE_CSR_EDGE( + 0, + (SELECT count(a.id) FROM Point2 a), + CAST ( + (SELECT sum(CREATE_CSR_VERTEX( + 0, + (SELECT count(a.id) FROM Point2 a), + sub.dense_id, + sub.cnt) + ) + FROM ( + SELECT a.rowid as dense_id, count(k.src) as cnt + FROM Point2 a + LEFT JOIN know2 k ON k.src = a.id + GROUP BY a.rowid) sub + ) + AS BIGINT), + a.rowid, + c.rowid, + k.rowid) as temp + FROM know2 k + JOIN Point2 a on a.id = k.src + JOIN Point2 c on c.id = k.dst +) SELECT a.id, b.id, iterativelength(0, (select count(*) from Point2), a.rowid, b.rowid, 2, 3) as path_length + FROM Point2 a, Point2 b, (select count(cte1.temp) * 0 as temp from cte1) __x + WHERE a.id = 0 and __x.temp * 0 + iterativelength(0, (select count(*) from Point2), a.rowid, b.rowid, 2, 3); +---- +0 2 2 + + +# Graph to test shortest path bound with a cycle +# (1) <- (0) <-> (2) +# 0 to 1 is 1 hop + +statement ok +CREATE TABLE Point3(id BIGINT); INSERT INTO Point3 VALUES (0), (1), (2); + +statement ok +CREATE TABLE know3(src BIGINT, dst BIGINT); INSERT INTO know3 VALUES (0, 1), (0, 2), (2, 0); + +statement ok +-CREATE PROPERTY GRAPH pg3 +VERTEX TABLES ( + Point3 PROPERTIES ( id ) LABEL Pnt + ) +EDGE TABLES ( + know3 SOURCE KEY ( src ) REFERENCES Point3 ( id ) + DESTINATION KEY ( dst ) REFERENCES Point3 ( id ) + LABEL Knows + ); + +query III +WITH cte1 AS ( + SELECT CREATE_CSR_EDGE( + 0, + (SELECT count(a.id) FROM Point3 a), + CAST ( + (SELECT sum(CREATE_CSR_VERTEX( + 0, + (SELECT count(a.id) FROM Point3 a), + sub.dense_id, + sub.cnt) + ) + FROM ( + SELECT a.rowid as dense_id, count(k.src) as cnt + FROM Point3 a + LEFT JOIN know3 k ON k.src = a.id + GROUP BY a.rowid) sub + ) + AS BIGINT), + a.rowid, + c.rowid, + k.rowid) as temp + FROM know3 k + JOIN Point3 a on a.id = k.src + JOIN 
Point3 c on c.id = k.dst +) SELECT a.id, b.id, iterativelength(0, (select count(*) from Point3), a.rowid, b.rowid, 2, 3) as path_length + FROM Point3 a, Point3 b, (select count(cte1.temp) * 0 as temp from cte1) __x + WHERE a.id = 0 and __x.temp * 0 + iterativelength(0, (select count(*) from Point3), a.rowid, b.rowid, 2, 3); +---- +