Partial Spark 1.3.0 update #65

Open

wants to merge 31 commits into base: master

Commits (31)
ae0723d  Updating to Apache Spark: 1.3.0 (gone-phishing, Apr 7, 2015)
29866f6  Updated Readme.md file (gone-phishing, Apr 7, 2015)
fcddebe  Fixed multiple scala version warnings (gone-phishing, Apr 9, 2015)
663b435  Merge remote-tracking branch 'origin/spark_1.3.0-update' into spark_1… (gone-phishing, Apr 9, 2015)
0894563  Fixing overlapping jars by exclusion in pom.xml (gone-phishing, Apr 9, 2015)
45fa37a  Fixing imports from dbpedia core jar (gone-phishing, Apr 15, 2015)
a4043b8  Adding scala version required (gone-phishing, Apr 15, 2015)
784bbb4  Fixed FunSuite for scalatest issue (FunSuite needs to be a trait and … (gone-phishing, Apr 21, 2015)
f702ce1  Using FunSuite as a trait (gone-phishing, Apr 21, 2015)
311cbd9  FunSuiteLike trait is used (see the sketch after the commit list) (gone-phishing, Apr 21, 2015)
d482ca7  Debugging the DistDownloadConfig.scala file for reading params (gone-phishing, May 25, 2015)
408ecff  Merge remote-tracking branch 'origin/spark_1.3.0-update' into spark_1… (gone-phishing, May 25, 2015)
480757c  Commenting the distconfig variable (gone-phishing, Jun 6, 2015)
1f642e4  Commenting the distconfig variable (gone-phishing, Jun 6, 2015)
3f9ff7c  Recompiling spark with scala 2.11.4 (gone-phishing, Jun 21, 2015)
0d4935e  Changing a lot of config properties (gone-phishing, Jun 23, 2015)
3493129  Update README.md (gone-phishing, Jun 23, 2015)
276e58d  Modifying the pom file (gone-phishing, Jun 27, 2015)
962320e  Modifying the pom file (gone-phishing, Jun 27, 2015)
f7b989e  Merge remote-tracking branch 'origin/spark_1.3.0-update' into spark_1… (gone-phishing, Jun 27, 2015)
3702a50  Changing a lot of config properties (gone-phishing, Jun 28, 2015)
ebd5c52  Removing data files (gone-phishing, Jun 28, 2015)
5b39ec6  Editing config files for download (gone-phishing, Jul 9, 2015)
f4a0881  Adding KryoSerializer for extractors (gone-phishing, Jul 9, 2015)
e0192bd  Working extraction for li languages (gone-phishing, Jul 20, 2015)
56ca1ec  Working extraction for li languages (gone-phishing, Jul 22, 2015)
cb7bbb1  Updating pom files for latest extraction framework build (gone-phishing, Aug 5, 2015)
215cc3e  Jsch authentication fixed using private key (gone-phishing, Aug 21, 2015)
a265679  Adding the data folder in the repo using a dummy file (gone-phishing, Aug 24, 2015)
37d6d0f  Removed password based authentication completely and everything is in… (gone-phishing, Aug 27, 2015)
41234db  Updating all pom files to 4.1-SNAPSHOT for distribution-extraction-fr… (gone-phishing, Aug 27, 2015)
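
The FunSuite-related commits above hinge on a ScalaTest detail: in ScalaTest 2.x, FunSuite is a class, so it cannot be mixed into a test class that already extends something else (for example Akka's TestKit, which the download module's tests pull in via akka-testkit); FunSuiteLike is the equivalent trait that can be mixed in. A minimal sketch of the pattern, with hypothetical class and test names not taken from this PR:

```scala
// Illustrative only: mix in FunSuiteLike where FunSuite (a class) cannot be extended.
import akka.actor.ActorSystem
import akka.testkit.TestKit
import org.scalatest.FunSuiteLike

class WorkerProtocolSuite extends TestKit(ActorSystem("test-system")) with FunSuiteLike {
  test("the test kit's actor system is available") {
    assert(system.name == "test-system")
  }
}
```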
13 changes: 7 additions & 6 deletions README.md
@@ -8,14 +8,15 @@ This is currently a work-in-progress, and the instructions are mostly intended f
## Requirements
* Java 7
* Maven 3
* Apache Spark 0.9.1 built with Apache Hadoop 2.2.0
* Apache Spark 1.3.0 built with Apache Hadoop 2.5.2
* Scala 2.11.4

## Setup Apache Spark

```bash
$ wget http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz
$ tar xzf http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz
$ cd spark-0.9.1-bin-hadoop2
$ wget https://www.apache.org/dist/spark/spark-1.3.0/spark-1.3.0-bin-hadoop2.4.tgz
$ tar -xzf spark-1.3.0-bin-hadoop2.4.tgz
$ cd spark-1.3.0-bin-hadoop2.4
$ SCALA_HOME=/usr/share/java MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" mvn -Dhadoop.version=2.2.0 -Dprotobuf.version=2.5.0 -DskipTests clean package
```

@@ -49,7 +50,7 @@ sbin/start-all.sh

We have added a script for setting up Spark and Hadoop on Google Compute Engine with the optimal settings for this framework. You can find it in the **gce** directory.

Please refer to the [Spark official docs](http://spark.apache.org/docs/0.9.1/spark-standalone.html) for details on how to deploy Spark in standalone mode.
Please refer to the [Spark official docs](https://spark.apache.org/docs/latest/spark-standalone.html) for details on how to deploy Spark in standalone mode.

## How to Build

@@ -80,7 +81,7 @@ In the root directory run the following commands

Now perform parallel extractions on your Spark cluster:

$ ./run extraction extraction/src/test/resources/config.properties extraction/src/test/resources/disk-config.properties
$ ./run extraction extraction/src/test/resources/config.properties extraction/src/test/resources/dist-config.properties


### Testing
6 changes: 3 additions & 3 deletions common/pom.xml
@@ -7,12 +7,12 @@
<parent>
<groupId>org.dbpedia</groupId>
<artifactId>distributed-extraction</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
</parent>

<groupId>org.dbpedia.distributed-extraction</groupId>
<artifactId>common</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
<name>DBpedia Distributed Extraction Framework Commons</name>

<build>
@@ -28,7 +28,7 @@
<dependency>
<groupId>org.dbpedia.extraction</groupId>
<artifactId>core</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
</dependency>

<dependency>
@@ -1,9 +1,11 @@
package org.dbpedia.extraction.util

import java.io.{IOException, OutputStream, InputStream}
import java.io.{IOException, FileNotFoundException, OutputStream, InputStream}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.conf.Configuration
import java.nio.file.NotDirectoryException
import scala.language.implicitConversions
//import org.dbpedia.extraction.util.FileLike

object RichHadoopPath {

6 changes: 6 additions & 0 deletions data/tmp.txt
@@ -0,0 +1,6 @@
This file is here because an empty data/ directory cannot be added to GitHub, and the zip files that end up in this directory should not be pushed to GitHub either. The options were to list the directory's contents in .gitignore (so that the directory is pushed but not its contents) or to put this dummy file here to satisfy the needs of the octocat. The folder itself is mandatory because the template files expect it, and they should not fail and leave the end user to panic.

Thus this file was created...
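
For comparison, the .gitignore alternative mentioned above would look roughly like the sketch below (not part of this PR); note that Git does not track empty directories, so one tracked file such as this tmp.txt is still needed:

```
# ignore downloaded dumps in data/ but keep the tracked dummy file
data/*
!data/tmp.txt
```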
75 changes: 75 additions & 0 deletions download/dependency-reduced-pom.xml
@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>distributed-extraction</artifactId>
<groupId>org.dbpedia</groupId>
<version>4.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>org.dbpedia.distributed-extraction</groupId>
<artifactId>download</artifactId>
<name>DBpedia Distributed Dump Downloader</name>
<version>4.1-SNAPSHOT</version>
<build>
<plugins>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<version>2.2</version>
<executions>
<execution>
<id>downloads-jar</id>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer />
<transformer>
<resource>reference.conf</resource>
</transformer>
<transformer>
<manifestEntries>
<Main-Class>worker.Main</Main-Class>
</manifestEntries>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.1.6</version>
<configuration>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
<launchers>
<launcher>
<id>seq-download</id>
<mainClass>org.dbpedia.extraction.dump.download.Download</mainClass>
</launcher>
<launcher>
<id>download</id>
<mainClass>org.dbpedia.extraction.dump.download.DistDownload</mainClass>
</launcher>
</launchers>
</configuration>
</plugin>
</plugins>
</build>
</project>

38 changes: 28 additions & 10 deletions download/pom.xml
@@ -7,12 +7,12 @@
<parent>
<groupId>org.dbpedia</groupId>
<artifactId>distributed-extraction</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
</parent>

<groupId>org.dbpedia.distributed-extraction</groupId>
<artifactId>download</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
<name>DBpedia Distributed Dump Downloader</name>

<build>
@@ -66,8 +66,13 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>

<version>3.1.6</version>
<configuration>
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
<scalaVersion>${scala.version}</scalaVersion>
<!--</configuration>-->

<!--<configuration>-->
<launchers>
<launcher>
<id>seq-download</id>
@@ -108,31 +113,44 @@
<dependency>
<groupId>org.dbpedia.extraction</groupId>
<artifactId>dump</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>org.dbpedia.distributed-extraction</groupId>
<artifactId>common</artifactId>
<version>4.0-SNAPSHOT</version>
<version>4.1-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
</exclusion>
</exclusions>
</dependency>


<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>

<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-contrib_2.10</artifactId>
<version>2.3.0</version>
<artifactId>akka-contrib_2.11</artifactId>
<version>2.3.11</version>
</dependency>

<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_2.10</artifactId>
<version>2.3.0</version>
<artifactId>akka-testkit_2.11</artifactId>
<version>2.3.11</version>
</dependency>

<dependency>
@@ -141,4 +159,4 @@
<version>0.1.51</version>
</dependency>
</dependencies>
</project>
</project>
@@ -45,7 +45,9 @@ object DistDownload extends RemoteExecute

for (host <- config.slaves)
{
println("Host: "+host)
val session = createSession(config.userName, host)
println("Session created")
for (worker <- 1 to config.workersPerSlave)
{
val command = """cd %s/download;mkdir -p ../logs;nohup ../run download join=%s %s > ../logs/%s-%d.out &""".
@@ -94,7 +94,7 @@ class DistDownloadConfig(args: TraversableOnce[String]) extends HadoopConfigurab
* Local temporary directory on worker nodes. Each dump file/chunk is downloaded to this directory before being moved to
* the configured Hadoop file system.
*/
var localTempDir: File = new File("/tmp")
var localTempDir: File = new File("/data/")

/**
* Slave hostnames. By default consists only of 127.0.0.1.
@@ -138,18 +138,24 @@ class DistDownloadConfig(args: TraversableOnce[String]) extends HadoopConfigurab
override protected val (hadoopCoreConf, hadoopHdfsConf, hadoopMapredConf) =
parseHadoopConfigs(null, args)


parse(null, args) // parse the distributed download config file/variables

if (homeDir == null)
throw Usage("Config variable extraction-framework-home not specified!")

println("parse dist-config file done")

downloadConfig.parse(null, generalArgs.toList) // parse the general config file

println("config files parsed")
if ((languages.nonEmpty || ranges.nonEmpty) && baseUrl == null) throw Usage("No base URL")
if (languages.isEmpty && ranges.isEmpty) throw Usage("No files to download")

println("base url and files to download found")
// First checks the Path obtained from distributed download config, then the general download config file if the former is null
baseDir = checkPathExists(Option(if (baseDir != null)
baseDir = checkPathExists(Option(
if (baseDir != null)
{
baseDir
}
@@ -175,6 +181,7 @@ class DistDownloadConfig(args: TraversableOnce[String]) extends HadoopConfigurab
def parse(dir: File, args: TraversableOnce[String])
{
// Parse the distributed config variables and accumulate the remaining variables in the generalArgs list.

for (a <- args; arg = a.trim) arg match
{
case Ignored(_) => // ignore comments
@@ -243,7 +250,7 @@ class DistDownloadConfig(args: TraversableOnce[String]) extends HadoopConfigurab
}
}

object Usage
object exUsage
{
def apply(msg: String, arg: String = null, cause: Throwable = null): Exception =
{
@@ -97,10 +97,10 @@ class Master(workTimeout: FiniteDuration, mirrors: Seq[URL], threadsPerMirror: I
else
{
log.info("Worker registered: {}", workerId)
workers += (workerId -> WorkerState(sender, status = Idle))
workers += (workerId -> WorkerState(sender, status = Idle))
if (pendingDownloads.nonEmpty)
sender ! DownloadIsReady
}
}

case WorkerRequestsDownload(workerId) =>
if (pendingDownloads.nonEmpty)
@@ -2,6 +2,7 @@ package org.dbpedia.extraction.util

import com.jcraft.jsch.{JSch, JSchException, ChannelExec, Session}
import java.io.IOException
import scala.io.Source

/**
* Utility trait for creating an SSH session and executing remote commands.
@@ -14,13 +15,19 @@ trait RemoteExecute

def addIdentity(privateKeyPath: String) = jsch.addIdentity(privateKeyPath)


def createSession(userName: String, host: String): Session =
{
println("User name: "+userName)

val session = jsch.getSession(userName, host)
session.setConfig("UserKnownHostsFile", "/dev/null")
session.setConfig("CheckHostIP", "no")
session.setConfig("StrictHostKeyChecking", "no")


session.connect()
println("Connected")
session
}

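
The RemoteExecute changes above, together with the "Jsch authentication fixed using private key" and "Removed password based authentication" commits, move the SSH layer to key-based login (see the private-key setting in dist-download.properties below). A minimal, self-contained sketch of that JSch pattern; the user name and host are placeholders, not values taken from this PR:

```scala
// Sketch of key-based JSch login; the user name and host below are placeholders.
import com.jcraft.jsch.{JSch, Session}

object JschKeyAuthSketch {
  def main(args: Array[String]): Unit = {
    val jsch = new JSch()
    // register the private key instead of configuring a password
    jsch.addIdentity(System.getProperty("user.home") + "/.ssh/id_rsa")

    val session: Session = jsch.getSession("someuser", "127.0.0.1")
    session.setConfig("UserKnownHostsFile", "/dev/null")
    session.setConfig("CheckHostIP", "no")
    session.setConfig("StrictHostKeyChecking", "no")
    session.connect()

    try println("connected: " + session.isConnected)
    finally session.disconnect()
  }
}
```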
10 changes: 5 additions & 5 deletions download/src/test/resources/dist-download.properties
@@ -1,11 +1,11 @@
# NOTE: format is not java.util.Properties, but org.dbpedia.extraction.dump.download.DownloadConfig

#distconfig=/example/path/file.cfg
#distconfig=/home/gonephishing/dbpedia-extraction/distributed-extraction-framework/download/src/test/resources/dist-download.properties
# Path to existing distributed download configuration text file (UTF-8) whose lines contain arguments
# in the format given here. Absolute or relative path. File paths in that config file will be interpreted
# relative to the config file.

#extraction-framework-home=/path/to/distributed-extraction-framework
extraction-framework-home=/home/gonephishing/dbpedia-extraction/distributed-extraction-framework/
# This must be set to the absolute path to the distributed extraction framework (containing this module)
# in all nodes. No default value is set.

@@ -34,11 +34,11 @@ max-duplicate-progress-reports=30
# to 30 (not recommended to go below that), the worker will declare a job as failed only after getting the same progress
# report for 30 times. By default set to 30.

local-temp-dir=/tmp
local-temp-dir=data/
# Local temporary directory on worker nodes. Each dump file/chunk is downloaded to this directory before being moved to
# the configured Hadoop file system. This is /tmp by default.

#private-key=/path/to/id_rsa
private-key=~/.ssh/id_rsa
# Optional identity file to connect to cluster nodes via SSH.

#ssh-passphrase=passphrase
@@ -65,7 +65,7 @@ master=127.0.0.1
slaves=127.0.0.1
# List of comma-separated slave hosts. Example: slaves=node1,node2,node3

base-dir=/tmp/basedir
#base-dir=data/
# Replace by your target folder. If this is omitted here, it is read from the general configuration file if there is any.

#join=akka.tcp://Workers@hostname:port