Merge branch 'master' into Spark3.5.0

RumbleDB · Feb 27, 2024 · f745e31
2 parents 524667d + 1efe9c7
commit f745e31
Show file tree

Hide file tree

Showing 10 changed files with 3,295 additions and 15 deletions.
diff --git a/FLWOR.ipynb b/FLWOR.ipynb
diff --git a/README.md b/README.md
@@ -16,3 +16,4 @@ The documentation also contains an introduction specific to RumbleDB and how you
 
 [The documentation of the current master (for the adventurous and curious) is available here.](http://sparksoniq.readthedocs.io/en/latest/)
 
+RumbleDB is an effort involving many researchers and ETH Zurich students: code and support by Stefan Irimescu, Ghislain Fourny, Gustavo Alonso, Renato Marroquin, Rodrigo Bruno, Falko Noé, Ioana Stefan, Andrea Rinaldi, Stevan Mihajlovic, Mario Arduini, Can Berker Çıkış, Elwin Stephan, David Dao, Zirun Wang, Ingo Müller, Dan-Ovidiu Graur, Thomas Zhou, Olivier Goerens, Alexandru Meterez, Remo Röthlisberger, Dominik Bruggisser, David Loughlin.
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/EstimatorTests/MLEstimator-LDA.jq b/src/test/resources/test_files/RumbleML/RumbleML/EstimatorTests/MLEstimator-LDA.jq
@@ -1,4 +1,6 @@
-(:JIQS: ShouldRun; Output="({ "label" : 0, "name" : "a", "age" : 20, "weight" : 50, "topicDistribution" : [ 0.9915007293228202, 0.0084992706771797 ] }, { "label" : 1, "name" : "b", "age" : 21, "weight" : 55.3, "topicDistribution" : [ 0.9922903440390959, 0.007709655960904113 ] }, { "label" : 2, "name" : "c", "age" : 22, "weight" : 60.6, "topicDistribution" : [ 0.9929414141611251, 0.007058585838874805 ] }, { "label" : 3, "name" : "d", "age" : 23, "weight" : 65.9, "topicDistribution" : [ 0.9934942226012154, 0.006505777398784658 ] }, { "label" : 4, "name" : "e", "age" : 24, "weight" : 70.3, "topicDistribution" : [ 0.9938860832566909, 0.006113916743309047 ] }, { "label" : 5, "name" : "f", "age" : 25, "weight" : 75.6, "topicDistribution" : [ 0.9943034888595204, 0.005696511140479687 ] })" :)
+(:JIQS: ShouldRun; Output="({ "label" : 0, "name" : "a", "age" : 20, "weight" : 50, "topicDistribution" : [ 0.99150074, 0.00849927 ] }, { "label" : 1, "name" : "b", "age" : 21, "weight" : 55.3, "topicDistribution" : [ 0.9922903, 0.007709656 ] }, { "label" : 2, "name" : "c", "age" : 22, "weight" : 60.6, "topicDistribution" : [ 0.99294144, 0.007058586 ] }, { "label" : 3, "name" : "d", "age" : 23, "weight" : 65.9, "topicDistribution" : [ 0.9934942, 0.0065057776 ] }, { "label" : 4, "name" : "e", "age" : 24, "weight" : 70.3, "topicDistribution" : [ 0.99388605, 0.0061139166 ] }, { "label" : 5, "name" : "f", "age" : 25, "weight" : 75.6, "topicDistribution" : [ 0.99430346, 0.0056965114 ] })" :)
+
+
 let $data := annotate(
     json-file("../../../../queries/rumbleML/sample-ml-data-flat.json"),
     { "label": "integer", "binaryLabel": "integer", "name": "string", "age": "double", "weight": "double", "booleanCol": "boolean", "nullCol": "null", "stringCol": "string", "stringArrayCol": ["string"], "intArrayCol": ["integer"],  "doubleArrayCol": ["double"],  "doubleArrayArrayCol": [["double"]] }
@@ -20,5 +22,5 @@ return {
     "name": $result.name,
     "age": $result.age,
     "weight": $result.weight,
-    "topicDistribution": $result.topicDistribution
+    "topicDistribution": [ for $v in $result.topicDistribution[] return float($v) ]
 }
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/EstimatorTests/MLEstimator-Word2Vec.jq b/src/test/resources/test_files/RumbleML/RumbleML/EstimatorTests/MLEstimator-Word2Vec.jq
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline1.jq b/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline1.jq
@@ -1,4 +1,16 @@
-(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "features2" : [ 0.6811443247067493, -1.6574062675838137 ], "rawPrediction" : [ -37.80676708784381, 37.80676708784381 ], "probability" : [ 3.8082870431051935E-17, 1 ], "ocol1" : 1 }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "features2" : [ -2.9389857055027706, 0.48441694826197507 ], "rawPrediction" : [ 37.567707092586666, -37.567707092586666 ], "probability" : [ 1, 0 ], "ocol1" : 0 }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "features2" : [ 0.28609394877170796, 1.1822492465709793 ], "rawPrediction" : [ -19.168353441008513, 19.168353441008513 ], "probability" : [ 4.734671708097413E-9, 0.9999999952653283 ], "ocol1" : 1 })" :)
+(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "ocol1" : 1, "features2" : [ 0.6811443, -1.6574062 ], "rawPrediction" : [ -37.806767, 37.806767 ], "probability" : [ 3.808287E-17, 1 ] }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "ocol1" : 0, "features2" : [ -2.9389858, 0.48441696 ], "rawPrediction" : [ 37.567707, -37.567707 ], "probability" : [ 1, 0 ] }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "ocol1" : 1, "features2" : [ 0.28609395, 1.1822492 ], "rawPrediction" : [ -19.168354, 19.168354 ], "probability" : [ 4.7346718E-9, 1 ] })" :)
+
+declare function local:round($i as object) as object {
+  {|
+    remove-keys($i, ("features2", "rawPrediction", "probability")),
+    {
+      "features2" : [ for $v in $i.features2[] return float($v) ],
+      "rawPrediction" : [ for $v in $i.rawPrediction[] return float($v) ],
+      "probability" : [ for $v in $i.probability[] return float($v) ]
+    }
+  |}
+};
+
 let $vector-assembler := get-transformer("VectorAssembler")
 let $training-data := (
     {"id": 0, "label": 1, "col1": 0.0, "col2": 1.1, "col3": 0.1},
@@ -26,4 +38,4 @@ for $i in $trained_est2(
     $my-new-test-data,
     {"featuresCol": "features2", "predictionCol": "ocol1"}
 )
-return $i
+return local:round($i)
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline2.jq b/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline2.jq
@@ -1,4 +1,17 @@
-\1;95;0c(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "features2" : [ 0.6811443247067493, -1.6574062675838137 ], "rawPrediction" : [ -37.806767087840996, 37.806767087840996 ], "probability" : [ 3.808287043115909E-17, 1 ], "ocol1" : 1 }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "features2" : [ -2.9389857055027706, 0.48441694826197507 ], "rawPrediction" : [ 37.567707092585735, -37.567707092585735 ], "probability" : [ 1, 0 ], "ocol1" : 0 }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "features2" : [ 0.28609394877170796, 1.1822492465709793 ], "rawPrediction" : [ -19.168353441006627, 19.168353441006627 ], "probability" : [ 4.734671708106345E-9, 0.9999999952653283 ], "ocol1" : 1 })" :)
+(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "ocol1" : 1, "features2" : [ 0.6811443, -1.6574062 ], "rawPrediction" : [ -37.806767, 37.806767 ], "probability" : [ 3.808287E-17, 1 ] }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "ocol1" : 0, "features2" : [ -2.9389858, 0.48441696 ], "rawPrediction" : [ 37.567707, -37.567707 ], "probability" : [ 1, 0 ] }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "ocol1" : 1, "features2" : [ 0.28609395, 1.1822492 ], "rawPrediction" : [ -19.168354, 19.168354 ], "probability" : [ 4.7346718E-9, 1 ] })" :)
+
+declare function local:round($i as object) as object {
+  {|
+    remove-keys($i, ("features2", "rawPrediction", "probability")),
+    {
+      "features2" : [ for $v in $i.features2[] return float($v) ],
+      "rawPrediction" : [ for $v in $i.rawPrediction[] return float($v) ],
+      "probability" : [ for $v in $i.probability[] return float($v) ]
+    }
+  |}
+};
+
+
 let $vector-assembler := get-transformer("VectorAssembler")(?, { "inputCols" : [ "col1", "col2", "col3" ], "outputCol" : "features" })
 let $training-data := (
     {"id": 0, "label": 1, "col1": 0.0, "col2": 1.1, "col3": 0.1},
@@ -26,4 +39,4 @@ for $i in $trained_est2(
     $my-new-test-data,
     {"featuresCol": "features2", "predictionCol": "ocol1"}
 )
-return $i
+return local:round($i)
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline3.jq b/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline3.jq
@@ -33,4 +33,4 @@ for $i in $trained_est2(
     $my-new-test-data,
     {"featuresCol": "features2", "predictionCol": "ocol1"}
 )
-return $i
+return local:round($i)
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline4.jq b/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline4.jq
@@ -1,4 +1,15 @@
-(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "rawPrediction" : [ -38.071372676777095, 38.071372676777095 ], "probability" : [ 2.9228930727481084E-17, 1 ], "prediction" : 1 }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "rawPrediction" : [ 36.2958259818859, -36.2958259818859 ], "probability" : [ 0.9999999999999998, 2.220446049250313E-16 ], "prediction" : 0 }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "rawPrediction" : [ -21.18841691705933, 21.18841691705933 ], "probability" : [ 6.280402132408788E-10, 0.9999999993719598 ], "prediction" : 1 })" :)
+(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "prediction" : 1, "features2" : [ ], "rawPrediction" : [ -38.071373, 38.071373 ], "probability" : [ 2.922893E-17, 1 ] }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "prediction" : 0, "features2" : [ ], "rawPrediction" : [ 36.295826, -36.295826 ], "probability" : [ 1, 2.220446E-16 ] }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "prediction" : 1, "features2" : [ ], "rawPrediction" : [ -21.188417, 21.188417 ], "probability" : [ 6.2804023E-10, 1 ] })" :)
+
+declare function local:round($i as object) as object {
+  {|
+    remove-keys($i, ("features2", "rawPrediction", "probability")),
+    {
+      "features2" : [ for $v in $i.features2[] return float($v) ],
+      "rawPrediction" : [ for $v in $i.rawPrediction[] return float($v) ],
+      "probability" : [ for $v in $i.probability[] return float($v) ]
+    }
+  |}
+};
 
 declare type local:mytype as  {"id": "integer", "label": "integer", "col1": "decimal", "col2": "decimal", "col3": "decimal"};
 
@@ -26,4 +37,5 @@ let $test-data := $vector-assembler(validate type local:mytype* {
 })
 
 let $pip := local:pipeline($training-data, ?)
-return $pip($test-data)
+for $i in $pip($test-data)
+return local:round($i)
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline5.jq b/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline5.jq
@@ -1,6 +1,17 @@
-(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "rawPrediction" : [ -38.07137267677909, 38.07137267677909 ], "probability" : [ 2.922893072742273E-17, 1 ], "prediction" : 1 }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "rawPrediction" : [ 36.29582598188342, -36.29582598188342 ], "probability" : [ 0.9999999999999998, 2.220446049250313E-16 ], "prediction" : 0 }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "rawPrediction" : [ -21.188416917062117, 21.188416917062117 ], "probability" : [ 6.280402132391294E-10, 0.9999999993719598 ], "prediction" : 1 })" :)
+(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "prediction" : 1, "features2" : [ ], "rawPrediction" : [ -38.071373, 38.071373 ], "probability" : [ 2.922893E-17, 1 ] }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "prediction" : 0, "features2" : [ ], "rawPrediction" : [ 36.295826, -36.295826 ], "probability" : [ 1, 2.220446E-16 ] }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "prediction" : 1, "features2" : [ ], "rawPrediction" : [ -21.188417, 21.188417 ], "probability" : [ 6.2804023E-10, 1 ] })" :)
 declare type local:mytype as  {"id": "integer", "label": "integer", "col1": "decimal", "col2": "decimal", "col3": "decimal"};
 
+declare function local:round($i as object) as object {
+  {|
+    remove-keys($i, ("features2", "rawPrediction", "probability")),
+    {
+      "features2" : [ for $v in $i.features2[] return float($v) ],
+      "rawPrediction" : [ for $v in $i.rawPrediction[] return float($v) ],
+      "probability" : [ for $v in $i.probability[] return float($v) ]
+    }
+  |}
+};
+
 let $vector-assembler := get-transformer("VectorAssembler", {"inputCols" : [ "col1", "col2", "col3" ], "outputCol" : "features"})
 let $logisticregression := get-estimator("LogisticRegression", { "featuresCol" : "features" })
 
@@ -20,4 +31,6 @@ let $test-data := validate type local:mytype* {
 }
 
 let $pip := $pipeline($training-data, {})
-return $pip($test-data, {})
+for $i in $pip($test-data, {})
+return local:round($i)
+
diff --git a/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline6.jq b/src/test/resources/test_files/RumbleML/RumbleML/MLPipeline6.jq
@@ -1,6 +1,17 @@
-(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "rawPrediction" : [ -38.071372676775525, 38.071372676775525 ], "probability" : [ 2.9228930727526986E-17, 1 ], "prediction" : 1 }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "rawPrediction" : [ 36.295825981888, -36.295825981888 ], "probability" : [ 0.9999999999999998, 2.220446049250313E-16 ], "prediction" : 0 }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "rawPrediction" : [ -21.18841691705713, 21.18841691705713 ], "probability" : [ 6.280402132422621E-10, 0.9999999993719598 ], "prediction" : 1 })" :)
+(:JIQS: ShouldRun; Output="({ "id" : 0, "label" : 1, "col1" : -1, "col2" : 1.5, "col3" : 1.3, "features" : [ -1, 1.5, 1.3 ], "prediction" : 1, "features2" : [ ], "rawPrediction" : [ -38.071373, 38.071373 ], "probability" : [ 2.922893E-17, 1 ] }, { "id" : 1, "label" : 0, "col1" : 3, "col2" : 2, "col3" : -0.1, "features" : [ 3, 2, -0.1 ], "prediction" : 0, "features2" : [ ], "rawPrediction" : [ 36.295826, -36.295826 ], "probability" : [ 1, 2.220446E-16 ] }, { "id" : 2, "label" : 1, "col1" : 0, "col2" : 2.2, "col3" : -1.5, "features" : [ 0, 2.2, -1.5 ], "prediction" : 1, "features2" : [ ], "rawPrediction" : [ -21.188417, 21.188417 ], "probability" : [ 6.2804023E-10, 1 ] })" :)
 declare type local:mytype as  {"id": "integer", "label": "integer", "col1": "decimal", "col2": "decimal", "col3": "decimal"};
 
+declare function local:round($i as object) as object {
+  {|
+    remove-keys($i, ("features2", "rawPrediction", "probability")),
+    {
+      "features2" : [ for $v in $i.features2[] return float($v) ],
+      "rawPrediction" : [ for $v in $i.rawPrediction[] return float($v) ],
+      "probability" : [ for $v in $i.probability[] return float($v) ]
+    }
+  |}
+};
+
 let $pipeline := get-estimator("Pipeline", {
   "stages" : [
     get-transformer("VectorAssembler", {"inputCols" : [ "col1", "col2", "col3" ], "outputCol" : "features"}),
@@ -21,4 +32,6 @@ let $test-data := validate type local:mytype* {
 }
 
 let $pip := $pipeline($training-data, {})
-return $pip($test-data, {})
+for $i in $pip($test-data, {})
+return local:round($i)
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -16,3 +16,4 @@ The documentation also contains an introduction specific to RumbleDB and how you

		[The documentation of the current master (for the adventurous and curious) is available here.](http://sparksoniq.readthedocs.io/en/latest/)

		RumbleDB is an effort involving many researchers and ETH Zurich students: code and support by Stefan Irimescu, Ghislain Fourny, Gustavo Alonso, Renato Marroquin, Rodrigo Bruno, Falko Noé, Ioana Stefan, Andrea Rinaldi, Stevan Mihajlovic, Mario Arduini, Can Berker Çıkış, Elwin Stephan, David Dao, Zirun Wang, Ingo Müller, Dan-Ovidiu Graur, Thomas Zhou, Olivier Goerens, Alexandru Meterez, Remo Röthlisberger, Dominik Bruggisser, David Loughlin.