suaaa7 · suaaa7 · Dec 31, 2019 · Dec 22, 2019 · Dec 22, 2019 · Dec 28, 2019
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,23 @@
 *.class
 *.log
+
+# sbt specific
+.cache
+.history
+.lib
+dist/*
+target/
+lib_managed/
+src_managed/
+project/boot/
+project/plugins/project/
+
+# Scala-IDE specific
+.scala_dependencies
+.worksheet
+
+# Docker cache
+.cache
+.ivy2
+.sbt
+.bash_history
diff --git a/.jvmopts b/.jvmopts
@@ -0,0 +1,5 @@
+-Xms1024m
+-Xmx2048m
+-XX:ReservedCodeCacheSize=128m
+-XX:MaxMetaspaceSize=256m
+-Xss2m
diff --git a/.scalafmt.conf b/.scalafmt.conf
@@ -0,0 +1,11 @@
+style: defaultWithAlign
+maxColumn: 80
+docstrings: JavaDoc
+align: most
+rewrite {
+  rules: [AvoidInfix, PreferCurlyFors, RedundantBraces, RedundantParens, SortImports]
+  redundantBraces.maxLines: 10
+}
+align.openParenCallSite: false
+align.openParenDefnSite: false
+danglingParentheses: true
diff --git a/Makefile b/Makefile
@@ -0,0 +1,40 @@
+# Every target below is a command alias, not a file to be built.
+.PHONY: dbuild dbash dassembly upload-jar upload-csv add-steps checkenv
+
+# Build the hseeberger/scala-sbt image from the upstream repository's
+# "debian" build context with pinned JDK / sbt / Scala versions.
+dbuild:
+	docker build \
+		--build-arg BASE_IMAGE_TAG="8u212-b04-jdk-stretch" \
+		--build-arg SBT_VERSION="1.3.5" \
+		--build-arg SCALA_VERSION="2.13.1" \
+		--build-arg USER_ID=1001 \
+		--build-arg GROUP_ID=1001 \
+		-t hseeberger/scala-sbt \
+		github.com/hseeberger/scala-sbt.git#:debian
+
+# Open an interactive shell in the build image; the working directory is
+# mounted at /root inside the container.
+dbash:
+	docker run --rm -it -v $$(pwd):/root hseeberger/scala-sbt bash
+
+# Run `sbt assembly` inside the build image against the working directory.
+dassembly:
+	docker run --rm -it -v $$(pwd):/root hseeberger/scala-sbt sbt assembly
+
+# Copy the assembled batch jar to s3://$BUCKET_NAME/spark/.
+# NOTE(review): the path hard-codes scala-2.11 and the glob must match
+# exactly one jar -- confirm against the sbt build settings.
+upload-jar:
+	aws s3 cp \
+		./batch/target/scala-2.11/batch-assembly-*.jar \
+		s3://$${BUCKET_NAME}/spark/
+
+# Copy the v1 training CSV to s3://$BUCKET_NAME/spark/data/.
+upload-csv:
+	aws s3 cp \
+		./data/train_data_v1.csv \
+		s3://$${BUCKET_NAME}/spark/data/
+
+# Submit the batch as a spark-submit step on the EMR cluster $CLUSTER_ID,
+# selecting dev.conf for both driver and executors.
+add-steps:
+	aws emr add-steps --cluster-id $${CLUSTER_ID} --steps \
+		Type=CUSTOM_JAR,Name=SparkMLLr,ActionOnFailure=CONTINUE,Jar=command-runner.jar,Args=[spark-submit,--class,spark.ml.lr.SparkMLLrBatch,--deploy-mode,cluster,--master,yarn,--conf,'spark.executor.extraJavaOptions=-Dconfig.resource=dev.conf',--conf,'spark.driver.extraJavaOptions=-Dconfig.resource=dev.conf',s3://$${BUCKET_NAME}/spark/batch-assembly-0.1.0-SNAPSHOT.jar]
+
+# Print the environment variables the AWS targets depend on.
+checkenv:
+	@echo $${BUCKET_NAME}
+	@echo $${CLUSTER_ID}
diff --git a/batch/src/main/resources/application.conf b/batch/src/main/resources/application.conf
@@ -0,0 +1,7 @@
+models {
+  v1 {
+    modelPath: "spark/model/model_v1"
+    trainDataPath: "spark/data/train_data_v1.csv"
+    modelName: "v1"
+  }
+}
diff --git a/batch/src/main/resources/dev.conf b/batch/src/main/resources/dev.conf
@@ -0,0 +1,7 @@
+include "application.conf"
+
+environment: "dev"
+
+s3 {
+  bucketName: "emr-spark-ap-northeast-1-dev"
+}
diff --git a/batch/src/main/resources/log4j-spark.properties b/batch/src/main/resources/log4j-spark.properties
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Set everything to be logged to the console
+log4j.rootCategory=INFO, console
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+# Set the default spark-shell log level to WARN. When running the spark-shell, the
+# log level for this class is used to overwrite the root logger's log level, so that
+# the user can have different defaults for the shell and regular Spark apps.
+log4j.logger.org.apache.spark.repl.Main=WARN
+
+# Settings to quiet third party logs that are too verbose
+log4j.logger.org.sparkproject.jetty=WARN
+log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
+log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
+
+# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs
+# in SparkSQL with Hive support
+log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
+log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
+
+# Parquet related logging
+log4j.logger.org.apache.parquet.CorruptStatistics=ERROR
+log4j.logger.parquet.CorruptStatistics=ERROR
diff --git a/batch/src/main/scala/spark/ml/Config.scala b/batch/src/main/scala/spark/ml/Config.scala
@@ -0,0 +1,42 @@
+package spark.ml
+
+import spark.ml.entity.Environment
+import spark.ml.Config.{ModelGroupsConfig, S3Config}
+import pureconfig._
+import pureconfig.generic.auto._
+import pureconfig.generic.ProductHint
+
+/**
+  * Root of the application configuration.
+  *
+  * @param environment deployment environment, parsed from a non-empty string
+  * @param models      per-model settings
+  * @param s3          S3 settings
+  */
+final case class Config(
+    environment: Environment,
+    models: ModelGroupsConfig,
+    s3: S3Config
+)
+
+object Config {
+
+  /**
+    * Converts between a non-empty configuration string and [[Environment]],
+    * delegating parsing to `Environment.fromString` and rendering to
+    * `toString`.
+    */
+  implicit val environmentConvert: ConfigConvert[Environment] =
+    ConfigConvert.viaNonEmptyStringTry[Environment](
+      Environment.fromString(_).asScala,
+      _.toString
+    )
+
+  /**
+    * Loads [[Config]] from the default configuration source.
+    *
+    * Case-class fields are mapped with a CamelCase-to-CamelCase field
+    * mapping. `loadOrThrow` is used, so a load failure surfaces as an
+    * exception rather than a returned error value.
+    */
+  def load: Config = {
+    implicit def camelCaseFields[T]: ProductHint[T] =
+      ProductHint[T](ConfigFieldMapping(CamelCase, CamelCase))
+
+    ConfigSource.default.loadOrThrow[Config]
+  }
+
+  /** Settings grouped by model version; only `v1` is defined. */
+  final case class ModelGroupsConfig(
+      v1: LrModelConfig
+  )
+
+  /**
+    * Settings of one logistic-regression model.
+    *
+    * @param modelPath     path of the saved model
+    * @param trainDataPath path of the training data
+    * @param modelName     name of the model
+    */
+  final case class LrModelConfig(
+      modelPath: String,
+      trainDataPath: String,
+      modelName: String
+  )
+
+  /** S3 settings: the name of the bucket to use. */
+  final case class S3Config(
+      bucketName: String
+  )
+}
diff --git a/batch/src/main/scala/spark/ml/entity/Environment.scala b/batch/src/main/scala/spark/ml/entity/Environment.scala
@@ -0,0 +1,19 @@
+package spark.ml.entity
+
+import com.twitter.util._
+
+/**
+  * Deployment environment, identified by the string it is rendered as.
+  *
+  * @param s the textual form returned by `toString`
+  */
+sealed abstract class Environment(s: String) {
+  override val toString: String = s
+}
+
+object Environment {
+  case object Dev  extends Environment("dev")
+  case object Prod extends Environment("prod")
+
+  /**
+    * Parses the textual form of an environment.
+    *
+    * @param s text to parse; must equal the `toString` of `Dev` or `Prod`
+    * @return `Return` with the matching environment, otherwise a `Throw`
+    *         carrying an `Exception` that names the rejected input
+    */
+  def fromString(s: String): Try[Environment] =
+    if (s == Dev.toString) Return(Dev)
+    else if (s == Prod.toString) Return(Prod)
+    else Throw(new Exception(s"Invalid environment: $s"))
+}
diff --git a/batch/src/main/scala/spark/ml/entity/Features.scala b/batch/src/main/scala/spark/ml/entity/Features.scala
@@ -0,0 +1,33 @@
+package spark.ml.entity
+
+import org.apache.spark.ml.linalg.Vector
+
+/**
+  * Column-name lists that drive the feature stages of the training pipeline
+  * in `SparkMLLrBatch`.
+  */
+object Features {
+
+  /** Columns that are string-indexed and then one-hot encoded. */
+  val catFeatures: Array[String] =
+    Array(
+      "uid",
+      "hour",
+      "advertiserId",
+      "campaignId",
+      "adId",
+      "siteId",
+      "c1",
+      "c2"
+    )
+
+  /**
+    * Pairs of (input columns, output column) handed to `ConcatTransformer`;
+    * each output column is then indexed and encoded like `catFeatures`.
+    */
+  val concatFeatures: Array[(Array[String], String)] =
+    Array(
+      (Array("campaignId", "adId"), "ca"),
+      (Array("c1", "c2"), "cc")
+    )
+
+  /** Columns handed to `IsNotNullTransformer`. */
+  val isNotNullFeatures: Array[String] = Array("c3")
+
+  /** Columns fed to the `Imputer` as they are. */
+  val quaFeatures: Array[String] = Array("n1")
+
+  /** Columns passed through `LogarithmicTransformer` before imputation. */
+  val logFeatures: Array[String] = Array("n2")
+}
diff --git a/batch/src/main/scala/spark/ml/entity/TrainData.scala b/batch/src/main/scala/spark/ml/entity/TrainData.scala
@@ -0,0 +1,38 @@
+package spark.ml.entity
+
+import org.apache.spark.sql.types._
+
+/**
+  * One row of training data. Field order matches [[TrainData.schema]]:
+  * the label first, then the feature columns.
+  */
+final case class TrainData(
+    label: Double,
+    uid: String,
+    hour: Int,
+    advertiserId: Int,
+    campaignId: Int,
+    adId: Int,
+    siteId: Int,
+    c1: Int,
+    c2: Int,
+    n1: Double,
+    n2: Double,
+    c3: Int
+)
+
+object TrainData {
+
+  // Column name and Spark SQL type of every field, in file order.
+  private val columns: Seq[(String, DataType)] = Seq(
+    "label"        -> DoubleType,
+    "uid"          -> StringType,
+    "hour"         -> IntegerType,
+    "advertiserId" -> IntegerType,
+    "campaignId"   -> IntegerType,
+    "adId"         -> IntegerType,
+    "siteId"       -> IntegerType,
+    "c1"           -> IntegerType,
+    "c2"           -> IntegerType,
+    "n1"           -> DoubleType,
+    "n2"           -> DoubleType,
+    "c3"           -> IntegerType
+  )
+
+  /**
+    * Spark SQL schema of the training data; every column is declared
+    * non-nullable.
+    *
+    * NOTE(review): the training pipeline applies IsNotNullTransformer to
+    * `c3` and an Imputer to `n1` / `n2`, which suggests those columns can
+    * be missing -- confirm that non-nullable is intended here.
+    */
+  val schema: StructType =
+    StructType(
+      columns.map {
+        case (name, dataType) => StructField(name, dataType, nullable = false)
+      }
+    )
+}
diff --git a/batch/src/main/scala/spark/ml/lr/SparkMLLrBatch.scala b/batch/src/main/scala/spark/ml/lr/SparkMLLrBatch.scala
@@ -0,0 +1,111 @@
+package spark.ml.lr
+
+import com.twitter.app.App
+import com.twitter.logging.Logging
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.ml.feature.{
+  Imputer,
+  OneHotEncoderEstimator,
+  StringIndexer,
+  VectorAssembler
+}
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.types._
+import spark.ml.config
+import spark.ml.entity.{Features, TrainData}
+import spark.ml.transformer.{
+  ConcatTransformer,
+  IsNotNullTransformer,
+  LogarithmicTransformer
+}
+
+/**
+  * Batch job that fits a logistic-regression pipeline on the v1 training
+  * data and saves the fitted pipeline model.
+  *
+  * Input and output locations are `s3a://` URIs built from
+  * `config.s3.bucketName` plus `config.models.v1.trainDataPath` and
+  * `config.models.v1.modelPath` respectively.
+  */
+object SparkMLLrBatch extends App with Logging {
+
+  /**
+    * Reads the training CSV, builds the feature pipeline, fits it and writes
+    * the model, overwriting whatever already exists at the model path.
+    *
+    * The Spark session is stopped on success and on failure alike; an
+    * exception raised while reading, fitting or saving is propagated.
+    */
+  def main(): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName("SparkMLLrBatch")
+      .getOrCreate()
+
+    try {
+      log.info("Batch Started")
+
+      // Headerless CSV: column names and types come from TrainData.schema.
+      val trainDF = spark.read
+        .format("com.databricks.spark.csv")
+        .option("header", "false")
+        .schema(TrainData.schema)
+        .load(
+          s"s3a://${config.s3.bucketName}/${config.models.v1.trainDataPath}"
+        )
+
+      // Raw categorical columns plus the columns produced by the concat
+      // stage. Computed once so that the indexer and encoder column lists
+      // cannot drift apart.
+      val categoricalCols =
+        Features.catFeatures ++ Features.concatFeatures.map(_._2)
+
+      val concaters = Features.concatFeatures.map {
+        case (inputCols, outputCol) =>
+          new ConcatTransformer()
+            .setInputCols(inputCols)
+            .setOutputCol(outputCol)
+      }
+
+      // "keep" makes the indexer tolerate values it did not see while
+      // fitting instead of failing on them.
+      val indexers = categoricalCols.map { name =>
+        new StringIndexer()
+          .setInputCol(name)
+          .setOutputCol(s"${name}_indexed")
+          .setHandleInvalid("keep")
+      }
+
+      val encoder = new OneHotEncoderEstimator()
+        .setInputCols(indexers.map(_.getOutputCol))
+        .setOutputCols(categoricalCols.map(name => s"${name}_processed"))
+
+      // The `s` interpolator is required: without it every output column
+      // would be named with the literal text "${name}_processed".
+      val isNotNuller = new IsNotNullTransformer()
+        .setInputCols(Features.isNotNullFeatures)
+        .setOutputCols(
+          Features.isNotNullFeatures.map(name => s"${name}_processed")
+        )
+
+      val logTransformer = new LogarithmicTransformer()
+        .setInputCols(Features.logFeatures)
+        .setOutputCols(Features.logFeatures.map(name => s"${name}_log"))
+
+      // Imputes the raw quantitative columns together with the log outputs;
+      // the output names are positionally aligned with the input columns.
+      val imputer = new Imputer()
+        .setInputCols(Features.quaFeatures ++ logTransformer.getOutputCols)
+        .setOutputCols(
+          (Features.quaFeatures ++ Features.logFeatures)
+            .map(name => s"${name}_processed")
+        )
+
+      val assembler = new VectorAssembler()
+        .setInputCols(
+          encoder.getOutputCols ++
+            isNotNuller.getOutputCols ++
+            imputer.getOutputCols
+        )
+        .setOutputCol("features")
+
+      val lr = new LogisticRegression()
+        .setMaxIter(100)
+        .setRegParam(0.001)
+        .setStandardization(false)
+
+      // Order matters: each stage consumes columns produced by earlier ones.
+      val stages = concaters ++ indexers ++ Array(
+        encoder,
+        isNotNuller,
+        logTransformer,
+        imputer,
+        assembler,
+        lr
+      )
+
+      val pipeline = new Pipeline()
+        .setStages(stages)
+
+      val model = pipeline.fit(trainDF)
+      model.write
+        .overwrite()
+        .save(
+          s"s3a://${config.s3.bucketName}/${config.models.v1.modelPath}"
+        )
+
+      log.info("Batch Completed")
+    } finally {
+      // Release the session even when one of the steps above throws.
+      spark.stop()
+    }
+  }
+}
diff --git a/batch/src/main/scala/spark/ml/package.scala b/batch/src/main/scala/spark/ml/package.scala
@@ -0,0 +1,5 @@
+package spark
+
+/** Package-wide values shared by the `spark.ml` code. */
+package object ml {
+
+  /**
+    * Application configuration, obtained from `Config.load` on first access
+    * and reused afterwards.
+    */
+  lazy val config: Config = {
+    Config.load
+  }
+}