support spark 3.1.x #35

Open
wants to merge 1 commit into base: master
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@ target/
.idea/
spark-binlog.iml
release.sh
*.iml
*/*.iml
9 changes: 5 additions & 4 deletions README.md
@@ -23,16 +23,16 @@ MySQL Binlog:

```
groupId: tech.mlsql
artifactId: mysql-binlog_2.11
version: 1.0.4
artifactId: mysql-binlog_2.12
version: 1.0.5
```

HBase WAL:

```
groupId: tech.mlsql
artifactId: hbase-wal_2.11
version: 1.0.4
artifactId: hbase-wal_2.12
version: 1.0.5
```
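
For reference, the coordinates above map to an ordinary Maven dependency, roughly as sketched below (shown for the MySQL binlog module; the HBase WAL module follows the same pattern with artifactId hbase-wal_2.12):

```
<dependency>
    <groupId>tech.mlsql</groupId>
    <artifactId>mysql-binlog_2.12</artifactId>
    <version>1.0.5</version>
</dependency>
```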

## Limitation
@@ -355,6 +355,7 @@ object Main{

```

If the above code fails when you run it, check whether the MySQL Connector/J dependency (mysql-connector-java) has been added to your project.
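
A minimal way to pull that driver in, assuming a Maven build (the version below is only an example; use whichever Connector/J release matches your MySQL server):

```
<!-- MySQL JDBC driver; version is an example, pick the one matching your server -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.28</version>
</dependency>
```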



4 changes: 2 additions & 2 deletions binlog-common/pom.xml
@@ -3,9 +3,9 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>spark-binlog_2.11</artifactId>
<artifactId>spark-binlog_2.12</artifactId>
<groupId>tech.mlsql</groupId>
<version>1.0.4</version>
<version>1.0.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@@ -1,6 +1,6 @@
package org.apache.spark.streaming;

import org.apache.spark.sql.sources.v2.reader.streaming.Offset;
import org.apache.spark.sql.execution.streaming.Offset;

import java.io.Serializable;

@@ -1,7 +1,8 @@
package tech.mlsql.binlog.common

import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2}
import org.apache.spark.sql.connector.read.streaming.Offset
import org.apache.spark.sql.execution.streaming.SerializedOffset
import org.apache.spark.sql.execution.streaming.{Offset => OffsetV2}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

4 changes: 2 additions & 2 deletions hbase-wal/pom.xml
@@ -3,9 +3,9 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>spark-binlog_2.11</artifactId>
<artifactId>spark-binlog_2.12</artifactId>
<groupId>tech.mlsql</groupId>
<version>1.0.4</version>
<version>1.0.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<properties>
@@ -133,7 +133,7 @@ case class MLSQLHBaseWAlSource(hostAndPort: ReportHostAndPort, spark: SparkSessi
if (content(0) == 'v') {
val indexOfNewLine = content.indexOf("\n")
if (indexOfNewLine > 0) {
val version = parseVersion(content.substring(0, indexOfNewLine), VERSION)
val version = validateVersion(content.substring(0, indexOfNewLine), VERSION)
CommonSourceOffset(SerializedOffset(content.substring(indexOfNewLine + 1)))
} else {
throw new IllegalStateException(
@@ -4,17 +4,16 @@ import java.io.EOFException
import java.nio.charset.Charset
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicBoolean

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.{Delete, Put}
import org.apache.hadoop.hbase.wal.{WAL, WALEdit, WALFactory}
import org.apache.hadoop.hbase.{Cell, CellUtil}
import org.apache.spark.sql.execution.streaming.LongOffset
import org.apache.spark.sql.sources.v2.reader.streaming.Offset
import org.apache.spark.sql.execution.streaming.Offset
import org.apache.spark.streaming.RawEvent
import org.spark_project.guava.cache.{CacheBuilder, CacheLoader, LoadingCache}
import tech.mlsql.binlog.common.HDFSContext
import tech.mlsql.common.utils.cache.{CacheBuilder, CacheLoader, LoadingCache}
import tech.mlsql.common.utils.log.Logging

import scala.collection.JavaConverters._
@@ -4,10 +4,9 @@ import java.io.{DataInputStream, DataOutputStream}
import java.util
import java.util.concurrent.atomic.AtomicReference
import java.util.regex.Pattern

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkEnv
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset}
import org.apache.spark.sql.mlsql.sources.hbase.wal.io.{DeleteWriter, PutWriter}
import org.apache.spark.streaming.RawEvent
import tech.mlsql.binlog.common.OriginalSourceServerInExecutor
@@ -83,6 +82,17 @@ class HBaseWALSocketServerInExecutor[T](taskContextRef: AtomicReference[T], chec
jsonList
}

/**
* Convert a generic Offset to a LongOffset when possible.
* Note: since Spark 3.1 the LongOffset companion object no longer provides a convert method, so this helper is added here to keep the code consistent.
* @return the converted LongOffset, or None if the offset type is not supported
*/
def convert(offset: Offset): Option[LongOffset] = offset match {
case lo: LongOffset => Some(lo)
case so: SerializedOffset => Some(LongOffset(so))
case _ => None
}

override def process(dIn: DataInputStream, dOut: DataOutputStream): Unit = {
client.readRequest(dIn) match {
case _: NooopsRequest =>
@@ -93,7 +103,7 @@ class HBaseWALSocketServerInExecutor[T](taskContextRef: AtomicReference[T], chec
flushAheadLog
}
val offsets = committedOffsets.asScala.
map(f => (f._1, (LongOffset.convert(f._2).get.offset+1).toString))
map(f => (f._1, (convert(f._2).get.offset+1).toString))
client.sendResponse(dOut, OffsetResponse(offsets.toMap))
case RequestData(name, startOffset, endOffset) =>
try {
@@ -180,7 +190,7 @@ class HBaseWALSocketServerInExecutor[T](taskContextRef: AtomicReference[T], chec
require(a != null || b != null, "two offsets should not be null at the same time ")
if (a == null) true
else if (b == null) false
else LongOffset.convert(a).get.offset < LongOffset.convert(b).get.offset
else convert(a).get.offset < convert(b).get.offset
}
}

@@ -0,0 +1 @@
org.apache.spark.sql.mlsql.sources.hbase.MLSQLHBaseWALDataSource
9 changes: 7 additions & 2 deletions mysql-binlog/pom.xml
@@ -3,9 +3,9 @@
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>spark-binlog_2.11</artifactId>
<artifactId>spark-binlog_2.12</artifactId>
<groupId>tech.mlsql</groupId>
<version>1.0.4</version>
<version>1.0.5</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@@ -27,6 +27,11 @@
<artifactId>mysql-binlog-connector-java</artifactId>
<version>${binlog.version}</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.9.9</version>
</dependency>
</dependencies>

</project>
@@ -9,6 +9,7 @@ import java.util.{Locale, UUID}

import com.github.shyiko.mysql.binlog.network.ServerException
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.Path
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
@@ -23,7 +24,6 @@ import org.apache.spark.{SparkEnv, TaskContext}
import tech.mlsql.common.utils.hdfs.HDFSOperator
import tech.mlsql.common.utils.path.PathFun


/**
* This Datasource is used to consume MySQL binlog. Not support MariaDB yet because the connector we are using is
* lack of the ability.
@@ -109,24 +109,31 @@ class MLSQLBinLogDataSource extends StreamSourceProvider with DataSourceRegister
val checkPointDir = metadataPath.stripSuffix("/").split("/").
dropRight(2).mkString("/")

def getOffsetFromCk = {
val offsetPath = PathFun(checkPointDir).
add("offsets").
toPath
def getOffsetFromCk: Option[Long] = {
// [bugfix] "checkpointLocation:offsets" is not a valid DFS file name, because File.pathSeparator is ':' or ';'
// val offsetPath = PathFun(checkPointDir).add("offsets").toPath
val offsetPath = new Path(checkPointDir.stripSuffix(Path.SEPARATOR), "offsets").toString
logInfo(s"offsetPath acquired from checkpoint: ${offsetPath}")

val lastFile = HDFSOperator.listFiles(offsetPath)
val files = HDFSOperator.listFiles(offsetPath)
if (files.isEmpty) {
logInfo(s"OffsetPath: ${offsetPath} checkpoint not found!")
return None
}
val lastFile = files
.filterNot(f => f.getPath.getName.endsWith(".tmp.crc") || f.getPath.getName.endsWith(".tmp"))
.map { fileName =>
(fileName.getPath.getName.split("/").last.toInt, fileName.getPath)
}
.sortBy(f => f._1).last._2

.map { fileName => (fileName.getPath.getName.split("/").last.toInt, fileName.getPath) }
.sortBy(f => f._1)
.last._2
val content = HDFSOperator.readFile(lastFile.toString)
content.split("\n").last.toLong
Some(content.split("\n").last.toLong)
}

val offsetFromCk = try {
Option(LongOffset(getOffsetFromCk))
getOffsetFromCk match {
case Some(checkpoint) => Option(LongOffset(checkpoint))
case _ => None
}
} catch {
case e: Exception =>
logError(e.getMessage, e)
@@ -181,31 +188,32 @@ class MLSQLBinLogDataSource extends StreamSourceProvider with DataSourceRegister
ex.printStackTrace()
}

def sendStopBinlogServerRequest = {
// send signal to stop server
val socket2 = new Socket(executorBinlogServer.host, executorBinlogServer.port)
val dout2 = new DataOutputStream(socket2.getOutputStream)
BinLogSocketServerCommand.sendRequest(dout2,
ShutdownBinlogServer())
socket2.close()
}

/**
* Register a callback that shuts down the binlog server when the Spark task fails.
*/
TaskContext.get().addTaskFailureListener(new TaskFailureListener {
override def onTaskFailure(context: TaskContext, error: Throwable): Unit = {
taskContextRef.set(null)
sendStopBinlogServerRequest

val socket = new Socket(executorBinlogServer.host, executorBinlogServer.port)
val out = new DataOutputStream(socket.getOutputStream)
BinLogSocketServerCommand.sendRequest(out, ShutdownBinlogServer())
socket.close()
}
})

/**
* Register a callback that shuts down the binlog server when the Spark task completes.
*/
TaskContext.get().addTaskCompletionListener(new TaskCompletionListener {
override def onTaskCompletion(context: TaskContext): Unit = {
taskContextRef.set(null)
sendStopBinlogServerRequest
val socket = new Socket(executorBinlogServer.host, executorBinlogServer.port)
val out = new DataOutputStream(socket.getOutputStream)
BinLogSocketServerCommand.sendRequest(out, ShutdownBinlogServer())
socket.close()
}
})


val socket = new Socket(tempSocketServerHost, tempSocketServerPort)
val dout = new DataOutputStream(socket.getOutputStream)
BinLogSocketServerCommand.sendRequest(dout,
@@ -252,7 +260,7 @@ class MLSQLBinLogDataSource extends StreamSourceProvider with DataSourceRegister
MLSQLBinLogSource(executorBinlogServer, sqlContext.sparkSession, metadataPath, finalStartingOffsets, parameters ++ Map("binlogServerId" -> binlogServerId))
}

override def shortName(): String = "mysql-binglog"
override def shortName(): String = "mysql-binlog"
}

/**
@@ -317,7 +325,7 @@ case class MLSQLBinLogSource(executorBinlogServer: ExecutorBinlogServer,
if (content(0) == 'v') {
val indexOfNewLine = content.indexOf("\n")
if (indexOfNewLine > 0) {
val version = parseVersion(content.substring(0, indexOfNewLine), VERSION)
val version = validateVersion(content.substring(0, indexOfNewLine), VERSION)
LongOffset(SerializedOffset(content.substring(indexOfNewLine + 1)))
} else {
throw new IllegalStateException(
@@ -348,6 +356,17 @@ case class MLSQLBinLogSource(executorBinlogServer: ExecutorBinlogServer,
LongOffset(response.currentOffset)
}

/**
* Convert a generic Offset to a LongOffset when possible.
* Note: since Spark 3.1 the LongOffset companion object no longer provides a convert method, so this helper is added here to keep the code consistent.
* @return the converted LongOffset, or None if the offset type is not supported
*/
def convert(offset: Offset): Option[LongOffset] = offset match {
case lo: LongOffset => Some(lo)
case so: SerializedOffset => Some(LongOffset(so))
case _ => None
}

override def getOffset: Option[Offset] = {
synchronized {
if (initialized.compareAndSet(false, true)) {
@@ -368,7 +387,7 @@ case class MLSQLBinLogSource(executorBinlogServer: ExecutorBinlogServer,

initialPartitionOffsets

val untilPartitionOffsets = LongOffset.convert(end)
val untilPartitionOffsets = convert(end)

// On recovery, getBatch will get called before getOffset
if (currentPartitionOffsets.isEmpty) {
@@ -383,10 +402,8 @@ case class MLSQLBinLogSource(executorBinlogServer: ExecutorBinlogServer,
// once we have changed checkpoint path, then we can start from provided starting offset.
// In normal case, we will recover the start from checkpoint offset directory
val fromPartitionOffsets = start match {
case Some(prevBatchEndOffset) =>
LongOffset.convert(prevBatchEndOffset)
case None =>
Some(initialPartitionOffsets)
case Some(prevBatchEndOffset) => convert(prevBatchEndOffset)
case None => Some(initialPartitionOffsets)
}

val executorBinlogServerCopy = executorBinlogServer.copy()
@@ -180,7 +180,7 @@ class BinLogSocketServerInExecutor[T](taskContextRef: AtomicReference[T],

val eventDeserializer = new EventDeserializer()
eventDeserializer.setCompatibilityMode(
//EventDeserializer.CompatibilityMode.DATE_AND_TIME_AS_LONG,
//EventDeserializer.CompatibilityMode.DATE_AND_TIME_AS_LONG_MICRO,
EventDeserializer.CompatibilityMode.CHAR_AND_BINARY_AS_BYTE_ARRAY
//EventDeserializer.CompatibilityMode.INVALID_DATE_AND_TIME_AS_MIN_VALUE
)