Merge branch 'master' into Brew-release
ghislainfourny authored Oct 28, 2024
2 parents 73fc48e + 1f52989 commit 364fbdf
Showing 395 changed files with 8,225 additions and 2,211 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/maven.yml
@@ -115,3 +115,26 @@ jobs:
- name: MLTestsNativeDeactivated
run: mvn -Dtest=MLTestsNativeDeactivated test

tests4:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Java 11
uses: actions/setup-java@v3
with:
java-version: 11
distribution: adopt
- name: Cache Maven packages
uses: actions/cache@v3
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
- name: Install with Maven
run: mvn install -DskipTests -Dgpg.skip --quiet
- name: Compile with Maven
run: mvn clean compile assembly:single
- name: DeltaUpdateRuntimeTests
run: mvn -Dtest=DeltaUpdateRuntimeTests test
9 changes: 7 additions & 2 deletions .gitlab-ci.yml
@@ -79,8 +79,13 @@ NativeFLWORRuntimeTestsParallelismDeactivated:
script:
- mvn -Dtest=NativeFLWORRuntimeTestsParallelismDeactivated test

StaticTypingTest:
stage: tests3
updatedeltaruntime-test:
stage: test
script:
- mvn -Dtest=DeltaUpdateRuntimeTests test

statictyping-test:
stage: test
script:
- mvn -Dtest=StaticTypeTests test

10 changes: 6 additions & 4 deletions docs/Getting started.md
@@ -78,11 +78,11 @@ If you use Linux, Florian Kellner also kindly contributed an [installation scrip

RumbleDB requires an Apache Spark installation on Linux, Mac or Windows.

It is straightforward to directly [download it](https://spark.apache.org/downloads.html), unpack it and put it at a location of your choosing. We recommend to pick Spark 3.2.2. Let us call this location SPARK_HOME (it is a good idea, in fact to also define an environment variable SPARK_HOME pointing to the absolute path of this location).
It is straightforward to directly [download it](https://spark.apache.org/downloads.html), unpack it, and put it at a location of your choosing. We recommend picking Spark 3.4.3. Let us call this location SPARK_HOME (it is a good idea, in fact, to also define an environment variable SPARK_HOME pointing to the absolute path of this location).

What you need to do then is to add the subdirectory "bin" within the unpacked directory to the PATH variable. On macOS this is done by adding

export SPARK_HOME=/path/to/spark-3.2.2-bin-hadoop3.2
export SPARK_HOME=/path/to/spark-3.4.3-bin-hadoop3
export PATH=$SPARK_HOME/bin:$PATH

(with SPARK_HOME appropriately set to match your unzipped Spark directory) to the file .zshrc in your home directory, then making sure to force the change with
@@ -111,9 +111,11 @@ Like Spark, RumbleDB is just a download and no installation is required.

In order to run RumbleDB, you simply need to download one of the small .jar files from the [download page](https://github.com/RumbleDB/rumble/releases) and put it in a directory of your choice, for example, right besides your data.

If you use Spark 3.2+, use rumbledb-1.22.0-for-spark-3.2.jar.
If you use Spark 3.4+, use rumbledb-1.22.0-for-spark-3.4.jar.

If you use Spark 3.3+, use rumbledb-1.22.0-for-spark-3.3.jar.
If you use Spark 3.5+, use rumbledb-1.22.0-for-spark-3.5.jar.

If you use Spark 4.0+ (preview), use rumbledb-1.22.0-for-spark-4.0.jar.

These jars do not embed Spark, since you chose to set it up separately. They will work with your Spark installation with the spark-submit command.

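The PATH setup described in the changed docs above can be sketched as a short shell snippet. The Spark directory name follows the spark-3.4.3 release naming, and the commented spark-submit line is a hypothetical invocation using one of the jar names mentioned above; adjust both to your actual download:

```shell
# Assumed unpack location; replace with wherever you put Spark.
export SPARK_HOME=/path/to/spark-3.4.3-bin-hadoop3
export PATH="$SPARK_HOME/bin:$PATH"

# Sanity check: the first PATH entry should now be Spark's bin directory.
echo "$PATH" | cut -d: -f1

# Hypothetical invocation of RumbleDB via spark-submit, once the jar
# from the download page sits in the current directory:
# spark-submit rumbledb-1.22.0-for-spark-3.4.jar
```

Adding these exports to `.zshrc` (or `.bashrc`) makes the setting persistent across shells, as the docs describe.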
4 changes: 2 additions & 2 deletions docs/install.md
@@ -7,9 +7,9 @@ We show here how to install RumbleDB from the github repository if you wish to d
The following software is required:

- [Java SE](http://www.oracle.com/technetwork/java/javase/downloads/index.html) 8 (last tested on OpenJDK 8u251). The version of Java is important, as Spark only works with Java 8 or Java 11.
- [Spark](https://spark.apache.org/), version 3.1.2 (for example)
- [Spark](https://spark.apache.org/), version 3.4.3 (for example)
- [Ant](https://ant.apache.org/), version 1.11.1
- [ANTLR](https://www.antlr.org/), version 4.8 (supplied in our repository)
- [ANTLR](https://www.antlr.org/), version 4.9.3 (supplied in our repository)
- [Maven](https://maven.apache.org/) 3.6.0

Important: the ANTLR version varies with the Spark version, because Spark is also shipped with an ANTLR runtime (example: Spark 3.0 and 3.1 ship with ANTLR 4.7, Spark 3.2 with ANTLR 4.8). The ANTLR runtime MUST match the ANTLR generator used to generate the RumbleDB jar file.
72 changes: 46 additions & 26 deletions pom.xml
@@ -197,6 +197,16 @@
</build>

<dependencies>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<version>1.37</version>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>1.37</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
@@ -257,36 +267,46 @@
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!--<dependency>
<groupId>edu.vanderbilt.accre</groupId>
<artifactId>laurelin</artifactId>
<version>1.0.1</version>
</dependency>-->
<dependency>
<groupId>org.jgrapht</groupId>
<artifactId>jgrapht-core</artifactId>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.10.6</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-yaml</artifactId>
<version>2.13.4</version>
</dependency>
<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-core_2.12</artifactId>
<version>2.4.0</version>
</dependency>
</dependencies>

<distributionManagement>
102 changes: 99 additions & 3 deletions src/main/java/org/rumbledb/api/Item.java
@@ -692,17 +692,113 @@ default boolean isNaN() {
* @return an int representing nestedness of the item inside transform expressions.
*/
default int getMutabilityLevel() {
return 0;
return -1;
}

/**
* Sets the mutability level of the item.
* Sets the mutability level of the item to a supplied value.
*
* @param mutabilityLevel the new mutability level.
* @param mutabilityLevel new mutability level.
*/
default void setMutabilityLevel(int mutabilityLevel) {
}

/**
* Returns the top level ID of the item.
*
* @return int representing the rowID of the item within a DeltaFile.
*/
default long getTopLevelID() {
return -1;
}

/**
* Sets the top level ID of the item to a supplied value.
*
* @param topLevelID new top level ID.
*/
default void setTopLevelID(long topLevelID) {
}

/**
* Returns the path from the top level object of a DeltaFile for the item.
*
* @return String representing the path of the item from the top level within a DeltaFile.
*/
default String getPathIn() {
return "null";
}

/**
* Sets the path from the top level object of a DeltaFile for the item to a supplied value.
*
* @param pathIn new path from top level.
*/
default void setPathIn(String pathIn) {
}

/**
* Returns the location of the DeltaFile for the item.
*
* @return String representing the location of the DeltaFile for the item.
*/
default String getTableLocation() {
return null;
}


/**
* Sets the location of the DeltaFile for the item to a supplied value.
*
* @param location new location of the DeltaFile for the item.
*/
default void setTableLocation(String location) {
}

/**
* Returns the SparkSQL value of the item for use in a query.
*
* @return String representing the SparkSQL value of the item.
*/
default String getSparkSQLValue() {
throw new UnsupportedOperationException("Operation not defined for type " + this.getDynamicType());
}

/**
* Returns the SparkSQL value of the item for use in a query.
*
* @return String representing the SparkSQL value of the item.
*/
default String getSparkSQLValue(ItemType itemType) {
throw new UnsupportedOperationException("Operation not defined for type " + this.getDynamicType());
}

/**
* Returns the SparkSQL type of the item for use in a query.
*
* @return String representing the SparkSQL type of the item.
*/
default String getSparkSQLType() {
throw new UnsupportedOperationException("Operation not defined for type " + this.getDynamicType());
}

/**
* Tests for physical equality, i.e., whether both items denote the same stored instance.
*
* @param other another item.
* @return true if it is equal to other, false otherwise.
*/
default boolean physicalEquals(Object other) {
if (!(other instanceof Item)) {
return false;
}
Item otherItem = (Item) other;
if (this.getTopLevelID() == -1 || otherItem.getTopLevelID() == -1) {
return System.identityHashCode(this) == System.identityHashCode(otherItem);
}
return this.getTopLevelID() == otherItem.getTopLevelID() && this.getPathIn().equals(otherItem.getPathIn());
}

/**
* Tests for logical equality. The semantics are that of the eq operator.
*
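The `physicalEquals` default added above falls back to reference identity whenever either item lacks a top-level ID, and otherwise compares row ID plus path inside the Delta table. A minimal standalone sketch of that behavior — `FakeItem` is a made-up stand-in for illustration, not the real `org.rumbledb.api.Item` interface, and it uses plain `==` where the diff uses `System.identityHashCode`:

```java
// Hypothetical stand-in mirroring the physicalEquals logic in the diff above.
public class FakeItem {
    private final long topLevelID;   // -1 means "not backed by a Delta table row"
    private final String pathIn;     // path from the top-level object

    public FakeItem(long topLevelID, String pathIn) {
        this.topLevelID = topLevelID;
        this.pathIn = pathIn;
    }

    public boolean physicalEquals(Object other) {
        if (!(other instanceof FakeItem)) {
            return false;
        }
        FakeItem otherItem = (FakeItem) other;
        // Without stable row IDs, fall back to reference identity.
        if (this.topLevelID == -1 || otherItem.topLevelID == -1) {
            return this == otherItem;
        }
        // With IDs, two items are physically equal when they sit at the
        // same row and the same path inside the Delta table.
        return this.topLevelID == otherItem.topLevelID
            && this.pathIn.equals(otherItem.pathIn);
    }

    public static void main(String[] args) {
        FakeItem a = new FakeItem(7, "$.name");
        FakeItem b = new FakeItem(7, "$.name");
        FakeItem c = new FakeItem(-1, "$.name");
        System.out.println(a.physicalEquals(b)); // same row and path
        System.out.println(c.physicalEquals(new FakeItem(-1, "$.name"))); // identity fallback
        System.out.println(c.physicalEquals(c)); // same reference
    }
}
```

Note that two distinct in-memory items with the same row ID and path compare equal, which is the point: they denote the same stored value in the Delta table.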
12 changes: 0 additions & 12 deletions src/main/java/org/rumbledb/api/Rumble.java
@@ -5,7 +5,6 @@
import org.rumbledb.context.DynamicContext;
import org.rumbledb.expressions.module.MainModule;
import org.rumbledb.runtime.RuntimeIterator;
import org.rumbledb.runtime.update.PendingUpdateList;
import sparksoniq.spark.SparkSessionManager;

import java.io.IOException;
@@ -52,11 +51,6 @@ public SequenceOfItems runQuery(String query) {
this.configuration
);

if (iterator.isUpdating()) {
PendingUpdateList pul = iterator.getPendingUpdateList(dynamicContext);
pul.applyUpdates(iterator.getMetadata());
}

return new SequenceOfItems(iterator, dynamicContext, this.configuration);
}

@@ -78,12 +78,6 @@ public SequenceOfItems runQuery(URI location) throws IOException {
this.configuration
);

if (iterator.isUpdating()) {
PendingUpdateList pul = iterator.getPendingUpdateList(dynamicContext);
pul.applyUpdates(iterator.getMetadata());
}

System.err.println("final iterator is: " + iterator.isUpdating());
return new SequenceOfItems(iterator, dynamicContext, this.configuration);
}
