Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
essiembre committed Dec 14, 2016
2 parents 99d9094 + f04df31 commit 1188f02
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 3 deletions.
7 changes: 7 additions & 0 deletions norconex-collector-filesystem/TODO.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
The following are either things to be done, or ideas to consider:

- Wire in SMB/JCIFS dependencies.

- Add Commons VFS third party dependencies for ALL supported file systems.

- Add Commons VFS config options for ALL supported file systems.
54 changes: 51 additions & 3 deletions norconex-collector-filesystem/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.norconex.collectors</groupId>
<artifactId>norconex-collector-filesystem</artifactId>
<version>2.6.0</version>
<version>2.6.1</version>
<name>Norconex Filesystem Collector</name>

<properties>
Expand Down Expand Up @@ -60,7 +60,7 @@
<dependency>
<groupId>com.norconex.collectors</groupId>
<artifactId>norconex-collector-core</artifactId>
<version>1.6.0</version>
<version>1.7.0</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
Expand All @@ -72,14 +72,36 @@
<artifactId>commons-vfs2</artifactId>
<version>2.1</version>
</dependency>

<!-- Following deps are required for FTP protocol (and probably others), see:
http://stackoverflow.com/questions/30600843/unable-to-resolve-a-dependency-on-urlutil-conflicting-requirements
http://stackoverflow.com/questions/12730984/noclassdeffound-error-on-ftp-client-org-apache-commons-net-ftp-ftpclient
-->
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.5</version>
</dependency>


</dependencies>

<dependencyManagement>
<dependencies>
<dependency>
<groupId>com.norconex.commons</groupId>
<artifactId>norconex-commons-lang</artifactId>
<version>1.11.0</version>
<version>1.12.3</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.10</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
Expand All @@ -91,6 +113,32 @@
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.5</version>
</dependency>
<!-- Upgraded from 5.2 (Importer/Tika dep) to avoid exception:
Caused by: java.lang.NoSuchMethodError:
jj2000.j2k.fileformat.reader.FileFormatReader.<init>(
Ljj2000/j2k/io/RandomAccessIO;Lcom/sun/media/imageioimpl/plugins/
jpeg2000/J2KMetadata;)V
-->
<dependency>
<groupId>edu.ucar</groupId>
<artifactId>jj2000</artifactId>
<version>5.3</version>
</dependency>
</dependencies>
</dependencyManagement>

Expand Down
13 changes: 13 additions & 0 deletions norconex-collector-filesystem/src/changes/changes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@
</properties>
<body>

<release version="2.6.1" date="2016-12-14" description="Maintenance release">
<action dev="essiembre" type="update">
Dependency updates: Norconex Commons Lang 1.12.3, JJ2000 5.3,
Norconex Collector Core 1.7.0, Apache HTTP Client 4.5.2,
Apache HTTP Core 4.4.5, Apache Commons Codec 1.10,
Apache Commons Net 3.5, Apache Commons HttpClient 3.1.
</action>
<action dev="essiembre" type="fix" issue="11">
Fixed FTP file system. Added third-party dependencies and FTP
configuration required for FTP file system to work.
</action>
</release>

<release version="2.6.0" date="2016-08-25" description="Feature release">
<action dev="essiembre" type="update">
Dependency updates: Norconex Collector Core 1.6.0,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileSystemManager;
import org.apache.commons.vfs2.FileSystemOptions;
import org.apache.commons.vfs2.VFS;
import org.apache.commons.vfs2.provider.ftp.FtpFileSystemConfigBuilder;

import com.norconex.collector.core.CollectorException;
import com.norconex.collector.core.crawler.AbstractCrawler;
Expand Down Expand Up @@ -79,7 +81,21 @@ protected void prepareExecution(
JobStatusUpdater statusUpdater, JobSuite suite,
ICrawlDataStore crawlDataStore, boolean resume) {



try {
FileSystemOptions opts = new FileSystemOptions();

// For FTP, these tweaks are required to get directory listings.
// More info:
//http://stackoverflow.com/questions/6046220/
// apache-commons-vfs-working-with-ftp
//https://commons.apache.org/proper/commons-vfs/filesystems.html#FTP
FtpFileSystemConfigBuilder ftpConfigBuilder =
FtpFileSystemConfigBuilder.getInstance();
ftpConfigBuilder.setPassiveMode(opts, true);
ftpConfigBuilder.setUserDirIsRoot(opts, false);

this.fileManager = VFS.getManager();
} catch (FileSystemException e) {
throw new FilesystemCollectorException(e);
Expand Down
21 changes: 21 additions & 0 deletions norconex-collector-filesystem/src/main/scripts/collector-fs.bat
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,26 @@
cd %~dp0
set ROOT_DIR=%~dp0

REM Third-party libraries sometimes have to create and write to temporary files.
REM By default those are created in your system "temp" folder
REM (usually defined under %TEMP% variable in Windows).
REM To change the temporary location those libraries will use, add the
REM following to the java command below (replacing the path):
REM
REM -Djava.io.tmpdir="C:\temp"

REM If you are experiencing memory problems or simply wish to increase crawling
REM performance you can specify the amount of memory allocated by increasing
REM the Java heap space. You can do so by adding the following to the Java
REM command below (using 2G as an example):
REM
REM -Xmx2G

REM For advanced users, JMX monitoring can be enabled by adding the following
REM to the java command below:
REM
REM -DenableJMX=true


java -Dlog4j.configuration="file:///%ROOT_DIR%log4j.properties" -Dfile.encoding=UTF8 -cp "./lib/*;./classes" com.norconex.collector.fs.FilesystemCollector %*

13 changes: 13 additions & 0 deletions norconex-collector-filesystem/src/main/scripts/collector-fs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,17 @@ export ROOT_DIR=$(realpath $(dirname $0))
#
# -Djava.io.tmpdir=/path/to/tmp

# If you are experiencing memory problems or simply wish to increase crawling
# performance you can specify the amount of memory allocated by increasing
# the Java heap space. You can do so by adding the following to the Java
# command below (using 2G as an example):
#
# -Xmx2G

# For advanced users, JMX monitoring can be enabled by adding the following
# to the java command below:
#
# -DenableJMX=true


java -Dlog4j.configuration="file:${ROOT_DIR}/log4j.properties" -Dfile.encoding=UTF8 -cp "./lib/*:./classes" com.norconex.collector.fs.FilesystemCollector "$@"
Original file line number Diff line number Diff line change
Expand Up @@ -232,4 +232,29 @@
</crawler>
</crawlers>


<!-- === FOR ADVANCED USE ONLY ===============================================
The following listeners are made to work with the JEF API
(https://www.norconex.com/jef/api/). Usage is recommended only
to programmers familiar with the JEF API.
Most users should ignore these. -->

<!-- Listen for JEF job events. The class must implement
com.norconex.jef4.job.IJobLifeCycleListener -->
<jobLifeCycleListeners>
<listener class="YourClass" />
</jobLifeCycleListeners>

<!-- Listen for JEF errors. The class must implement
com.norconex.jef4.job.IJobErrorListener -->
<jobErrorListeners>
<listener class="YourClass" />
</jobErrorListeners>

<!-- Listen for JEF suite events. The class must implement
com.norconex.jef4.suite.ISuiteLifeCycleListener -->
<suiteLifeCycleListeners>
<listener class="YourClass" />
</suiteLifeCycleListeners>

</fscollector>

0 comments on commit 1188f02

Please sign in to comment.