diff --git a/pom.xml b/pom.xml index 552f4ceb5a..590ab055a8 100644 --- a/pom.xml +++ b/pom.xml @@ -38,7 +38,6 @@ tika-parent tika-bom tika-core - tika-serialization tika-parsers tika-bundles tika-xmp @@ -114,6 +113,14 @@ + + + + + + + + diff --git a/tika-core/pom.xml b/tika-core/pom.xml index 7e163c061c..f5a9483c80 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -19,7 +19,8 @@ under the License. --> - + 4.0.0 @@ -40,6 +41,12 @@ org.slf4j slf4j-api + + org.pf4j + pf4j + + provided + commons-io commons-io @@ -255,9 +262,9 @@ -Xmx256m 240000 max - true - - + true + + org.apache.maven.plugins maven-project-info-reports-plugin diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java index 3e009f6665..3d4942c1b1 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java @@ -25,7 +25,6 @@ import org.apache.tika.io.FilenameUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.utils.StringUtils; public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDocumentBytesHandler { diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java index cf6441b4fb..8ac983fd77 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java @@ -25,7 +25,6 @@ import org.apache.commons.io.input.UnsynchronizedBufferedInputStream; import org.apache.tika.metadata.Metadata; -import 
org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; /** * For now, this is an in-memory EmbeddedDocumentBytesHandler that stores diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesConfig.java similarity index 99% rename from tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java rename to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesConfig.java index 542c1c8a30..df7980418d 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesConfig.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.extractor; +package org.apache.tika.extractor; import java.io.Serializable; import java.util.Objects; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java deleted file mode 100644 index 40121f9a7e..0000000000 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.fetcher; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -import org.apache.tika.config.ConfigBase; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; - -/** - * Utility class to hold multiple fetchers. - *

- * This forbids multiple fetchers supporting the same name. - */ -public class FetcherManager extends ConfigBase { - - public static FetcherManager load(Path p) throws IOException, TikaConfigException { - try (InputStream is = - Files.newInputStream(p)) { - return FetcherManager.buildComposite("fetchers", FetcherManager.class, - "fetcher", Fetcher.class, is); - } - } - private final Map fetcherMap = new ConcurrentHashMap<>(); - - public FetcherManager(List fetchers) throws TikaConfigException { - for (Fetcher fetcher : fetchers) { - String name = fetcher.getName(); - if (name == null || name.trim().length() == 0) { - throw new TikaConfigException("fetcher name must not be blank"); - } - if (fetcherMap.containsKey(fetcher.getName())) { - throw new TikaConfigException( - "Multiple fetchers cannot support the same prefix: " + fetcher.getName()); - } - fetcherMap.put(fetcher.getName(), fetcher); - } - } - - public Fetcher getFetcher(String fetcherName) throws IOException, TikaException { - Fetcher fetcher = fetcherMap.get(fetcherName); - if (fetcher == null) { - throw new IllegalArgumentException( - "Can't find fetcher for fetcherName: " + fetcherName + ". I've loaded: " + - fetcherMap.keySet()); - } - return fetcher; - } - - public Set getSupported() { - return fetcherMap.keySet(); - } - - /** - * Convenience method that returns a fetcher if only one fetcher - * is specified in the tika-config file. If 0 or > 1 fetchers - * are specified, this throws an IllegalArgumentException. - * @return - */ - public Fetcher getFetcher() { - if (fetcherMap.size() == 0) { - throw new IllegalArgumentException("fetchers size must == 1 for the no arg call"); - } - if (fetcherMap.size() > 1) { - throw new IllegalArgumentException("need to specify 'fetcherName' if > 1 fetchers are" + - " available"); - } - for (Fetcher fetcher : fetcherMap.values()) { - return fetcher; - } - //this should be unreachable?! 
- throw new IllegalArgumentException("fetchers size must == 0"); - } -} diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java deleted file mode 100644 index 7e29ac20ad..0000000000 --- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.pipes.fetcher.fs; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.nio.file.InvalidPathException; -import java.nio.file.Path; -import java.nio.file.Paths; - -import org.junit.jupiter.api.Test; - -import org.apache.tika.config.InitializableProblemHandler; - - -public class FileSystemFetcherTest { - - @Test - public void testDescendant() throws Exception { - - Path root = Paths.get("/ab/cd/"); - Path descendant = root.resolve("ef/gh/ij.pdf"); - assertTrue(FileSystemFetcher.isDescendant(root, descendant)); - - descendant = Paths.get("/cd/ef.pdf"); - assertFalse(FileSystemFetcher.isDescendant(root, descendant)); - - descendant = root.resolve("../../ij.pdf"); - assertFalse(FileSystemFetcher.isDescendant(root, descendant)); - } - - @Test - public void testNullByte() throws Exception { - FileSystemFetcher f = new FileSystemFetcher(); - assertThrows(InvalidPathException.class, () -> { - f.setBasePath("bad\u0000path"); - f.setName("fs"); - f.checkInitialization(InitializableProblemHandler.IGNORE); - }); - } -} diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml index 2faa23ce74..023148c2c8 100644 --- a/tika-fuzzing/pom.xml +++ b/tika-fuzzing/pom.xml @@ -39,6 +39,11 @@ ${project.version} provided + + org.apache.tika + tika-pipes-core + ${project.version} + ${project.groupId} tika-serialization @@ -87,6 +92,12 @@ test-jar test + + + org.pf4j + pf4j + provided + @@ -133,4 +144,4 @@ - \ No newline at end of file + diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java index fb38c20f80..52834ac4aa 100644 --- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java +++ b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java @@ -77,7 +77,7 @@ private void 
execute(FuzzingCLIConfig config) throws Exception { ArrayBlockingQueue q = new ArrayBlockingQueue(10000); PipesConfig pipesConfig = PipesConfig.load(config.getTikaConfig()); - FetcherManager fetcherManager = FetcherManager.load(config.getTikaConfig()); + FetcherManager fetcherManager = new FetcherManager(); int totalThreads = pipesConfig.getNumClients() + 1; diff --git a/tika-grpc/pom.xml b/tika-grpc/pom.xml index c937d38795..99a8517095 100644 --- a/tika-grpc/pom.xml +++ b/tika-grpc/pom.xml @@ -41,6 +41,7 @@ 3.0.0 true + 3.12.0 @@ -222,10 +223,20 @@ tika-fetcher-http ${project.version} + + org.apache.tika + tika-fetcher-file-system + ${project.version} + com.fasterxml.jackson.module jackson-module-jsonSchema + + org.pf4j + pf4j + ${pf4j.version} + com.asarkar.grpc grpc-test @@ -335,68 +346,6 @@ org.apache.tika.pipes.grpc.TikaGrpcServer - - maven-shade-plugin - ${maven.shade.version} - - - package - - shade - - - - false - - - - - - - - *:* - - module-info.class - META-INF/maven/plugin.xml - META-INF/versions/9/module-info.class - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - META-INF/*.txt - META-INF/ASL2.0 - META-INF/DEPENDENCIES - META-INF/LICENSE - META-INF/NOTICE - META-INF/README - META-INF/MANIFEST.MF - LICENSE.txt - NOTICE.txt - CHANGES - README - builddef.lst - - - - - - org.apache.tika.pipes.grpc.TikaGrpcServer - - true - - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - META-INF/NOTICE - target/classes/META-INF/NOTICE - - - - - - org.apache.maven.plugins maven-checkstyle-plugin @@ -429,6 +378,42 @@ + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/dependencies + + + + + + maven-assembly-plugin + 3.7.0 + + + src/assembly/grpc-assembly.xml + + false + + + + make-assembly + package + + single + + + + diff --git a/tika-grpc/src/assembly/grpc-assembly.xml b/tika-grpc/src/assembly/grpc-assembly.xml new file mode 100644 index 
0000000000..ee99848ed5 --- /dev/null +++ b/tika-grpc/src/assembly/grpc-assembly.xml @@ -0,0 +1,23 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/dependencies + / + + + ${project.build.directory} + / + + *.jar + + + + diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java index d21f11b08f..52afe4aaa6 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java @@ -29,14 +29,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.pipes.fetcher.AbstractFetcher; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; public class ExpiringFetcherStore implements AutoCloseable { private static final Logger LOG = LoggerFactory.getLogger(ExpiringFetcherStore.class); public static final long EXPIRE_JOB_INITIAL_DELAY = 1L; - private final Map fetchers = Collections.synchronizedMap(new HashMap<>()); - private final Map fetcherConfigs = Collections.synchronizedMap(new HashMap<>()); + private final Map fetcherConfigs = Collections.synchronizedMap(new HashMap<>()); private final Map fetcherLastAccessed = Collections.synchronizedMap(new HashMap<>()); private final ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor(); @@ -44,18 +42,18 @@ public class ExpiringFetcherStore implements AutoCloseable { public ExpiringFetcherStore(int expireAfterSeconds, int checkForExpiredFetchersDelaySeconds) { executorService.scheduleAtFixedRate(() -> { Set expired = new HashSet<>(); - for (String fetcherName : fetchers.keySet()) { - Instant lastAccessed = fetcherLastAccessed.get(fetcherName); + for (String fetcherId : fetcherConfigs.keySet()) { + Instant lastAccessed = fetcherLastAccessed.get(fetcherId); if (lastAccessed 
== null) { - LOG.error("Detected a fetcher with no last access time. FetcherName={}", fetcherName); - expired.add(fetcherName); + LOG.error("Detected a fetcher with no last access time. fetcherId={}", fetcherId); + expired.add(fetcherId); } else if (Instant .now() .isAfter(lastAccessed.plusSeconds(expireAfterSeconds))) { - LOG.info("Detected stale fetcher {} hasn't been accessed in {} seconds. " + "Deleting.", fetcherName, Instant + LOG.info("Detected stale fetcher {} hasn't been accessed in {} seconds. " + "Deleting.", fetcherId, Instant .now() .getEpochSecond() - lastAccessed.getEpochSecond()); - expired.add(fetcherName); + expired.add(fetcherId); } } for (String expiredFetcherId : expired) { @@ -64,18 +62,13 @@ public ExpiringFetcherStore(int expireAfterSeconds, int checkForExpiredFetchersD }, EXPIRE_JOB_INITIAL_DELAY, checkForExpiredFetchersDelaySeconds, TimeUnit.SECONDS); } - public boolean deleteFetcher(String fetcherName) { - boolean success = fetchers.remove(fetcherName) != null; - fetcherConfigs.remove(fetcherName); - fetcherLastAccessed.remove(fetcherName); + public boolean deleteFetcher(String fetcherId) { + boolean success = fetcherConfigs.remove(fetcherId) != null; + fetcherLastAccessed.remove(fetcherId); return success; } - public Map getFetchers() { - return fetchers; - } - - public Map getFetcherConfigs() { + public Map getFetcherConfigs() { return fetcherConfigs; } @@ -83,15 +76,15 @@ public Map getFetcherConfigs() { * This method will get the fetcher, but will also log the access the fetcher as having * been accessed. This prevents the scheduled job from removing the stale fetcher. 
*/ - public T getFetcherAndLogAccess(String fetcherName) { - fetcherLastAccessed.put(fetcherName, Instant.now()); - return (T) fetchers.get(fetcherName); + public C getFetcherConfigAndLogAccess(String fetcherId) { + fetcherLastAccessed.put(fetcherId, Instant.now()); + return (C) fetcherConfigs.get(fetcherId); } - public void createFetcher(T fetcher, C config) { - fetchers.put(fetcher.getName(), fetcher); - fetcherConfigs.put(fetcher.getName(), config); - getFetcherAndLogAccess(fetcher.getName()); + public void createFetcher(String fetcherId, C config) { + config.setFetcherId(fetcherId); + fetcherConfigs.put(fetcherId, config); + getFetcherConfigAndLogAccess(fetcherId); } @Override diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java index 506522c740..5779445f5a 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java @@ -21,6 +21,8 @@ import java.io.File; import java.io.FileWriter; import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.List; import java.util.concurrent.TimeUnit; import com.beust.jcommander.JCommander; @@ -45,12 +47,16 @@ public class TikaGrpcServer { private static final Logger LOGGER = LoggerFactory.getLogger(TikaGrpcServer.class); public static final int TIKA_SERVER_GRPC_DEFAULT_PORT = 50052; private Server server; + // create the plugin manager @Parameter(names = {"-p", "--port"}, description = "The grpc server port", help = true) private Integer port = TIKA_SERVER_GRPC_DEFAULT_PORT; - @Parameter(names = {"-c", "--config"}, description = "The grpc server port", help = true) + @Parameter(names = {"-c", "--config"}, description = "The grpc server configuration XML file", help = true) private File tikaConfigXml; + @Parameter(names = {"-d", "--plugins-dir"}, description = "Tika pipes plugin root directories", 
help = true) + private List pluginDirs; + @Parameter(names = {"-s", "--secure"}, description = "Enable credentials required to access this grpc server") private boolean secure; @@ -99,7 +105,7 @@ public void start() throws Exception { healthStatusManager.setStatus(TikaGrpcServer.class.getSimpleName(), ServingStatus.SERVING); server = Grpc .newServerBuilderForPort(port, creds) - .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath())) + .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath(), pluginDirs)) .addService(healthStatusManager.getHealthService()) .addService(ProtoReflectionService.newInstance()) .build() diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index 4eb5f0b010..424bac77c2 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -17,28 +17,19 @@ package org.apache.tika.pipes.grpc; import java.io.File; -import java.io.FileWriter; import java.io.IOException; -import java.lang.reflect.InvocationTargetException; import java.nio.charset.StandardCharsets; +import java.nio.file.Path; import java.util.HashMap; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; -import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; -import 
com.fasterxml.jackson.module.jsonSchema.JsonSchema; import com.fasterxml.jackson.module.jsonSchema.JsonSchemaGenerator; import com.google.rpc.Status; import io.grpc.protobuf.StatusProto; @@ -47,8 +38,6 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Element; import org.xml.sax.SAXException; import org.apache.tika.DeleteFetcherReply; @@ -59,12 +48,13 @@ import org.apache.tika.GetFetcherConfigJsonSchemaRequest; import org.apache.tika.GetFetcherReply; import org.apache.tika.GetFetcherRequest; +import org.apache.tika.ListFetcherPluginsReply; +import org.apache.tika.ListFetcherPluginsRequest; import org.apache.tika.ListFetchersReply; import org.apache.tika.ListFetchersRequest; import org.apache.tika.SaveFetcherReply; import org.apache.tika.SaveFetcherRequest; import org.apache.tika.TikaGrpc; -import org.apache.tika.config.Initializable; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; @@ -74,10 +64,10 @@ import org.apache.tika.pipes.PipesConfig; import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.emitter.EmitKey; -import org.apache.tika.pipes.fetcher.AbstractFetcher; import org.apache.tika.pipes.fetcher.FetchKey; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; import org.apache.tika.pipes.fetcher.config.FetcherConfigContainer; +import org.apache.tika.pipes.grpc.exception.TikaGrpcException; class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerImpl.class); @@ -94,11 +84,11 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { PipesClient pipesClient; ExpiringFetcherStore expiringFetcherStore; + String tikaConfigPath; - TikaGrpcServerImpl(String tikaConfigPath) - throws TikaConfigException, IOException, 
ParserConfigurationException, - TransformerException, SAXException { + TikaGrpcServerImpl(String tikaConfigPath, List pluginDirs) throws TikaConfigException, IOException, + ParserConfigurationException, TransformerException, SAXException { File tikaConfigFile = new File(tikaConfigPath); if (!tikaConfigFile.canWrite()) { File tmpTikaConfigFile = File.createTempFile("configCopy", tikaConfigFile.getName()); @@ -109,69 +99,72 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { tikaConfigFile = tmpTikaConfigFile; tikaConfigPath = tikaConfigFile.getAbsolutePath(); } - pipesConfig = PipesConfig.load(tikaConfigFile.toPath()); + pipesConfig = PipesConfig.load(tikaConfigFile.toPath(), pluginDirs); pipesClient = new PipesClient(pipesConfig); expiringFetcherStore = new ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(), pipesConfig.getStaleFetcherDelaySeconds()); - this.tikaConfigPath = tikaConfigPath; - updateTikaConfig(); - } - - private void updateTikaConfig() - throws ParserConfigurationException, IOException, SAXException, TransformerException { - Document tikaConfigDoc = - DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath); - - Element fetchersElement = (Element) tikaConfigDoc.getElementsByTagName("fetchers").item(0); - if (fetchersElement == null) { - fetchersElement = tikaConfigDoc.createElement("fetchers"); - tikaConfigDoc.getDocumentElement().appendChild(fetchersElement); - } - for (int i = 0; i < fetchersElement.getChildNodes().getLength(); ++i) { - fetchersElement.removeChild(fetchersElement.getChildNodes().item(i)); - } - for (var fetcherEntry : expiringFetcherStore.getFetchers().entrySet()) { - AbstractFetcher fetcherObject = fetcherEntry.getValue(); - Map fetcherConfigParams = OBJECT_MAPPER.convertValue( - expiringFetcherStore.getFetcherConfigs().get(fetcherEntry.getKey()), - new TypeReference<>() { - }); - Element fetcher = tikaConfigDoc.createElement("fetcher"); - fetcher.setAttribute("class", 
fetcherEntry.getValue().getClass().getName()); - Element fetcherName = tikaConfigDoc.createElement("name"); - fetcherName.setTextContent(fetcherObject.getName()); - fetcher.appendChild(fetcherName); - populateFetcherConfigs(fetcherConfigParams, tikaConfigDoc, fetcher); - fetchersElement.appendChild(fetcher); - } - DOMSource source = new DOMSource(tikaConfigDoc); - FileWriter writer = new FileWriter(tikaConfigPath, StandardCharsets.UTF_8); - StreamResult result = new StreamResult(writer); - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); - transformer.transform(source, result); - } - - private void populateFetcherConfigs(Map fetcherConfigParams, - Document tikaConfigDoc, Element fetcher) { - for (var configParam : fetcherConfigParams.entrySet()) { - Element configElm = tikaConfigDoc.createElement(configParam.getKey()); - fetcher.appendChild(configElm); - if (configParam.getValue() instanceof List) { - List configParamVal = (List) configParam.getValue(); - String singularName = configParam.getKey().substring(0, configParam.getKey().length() - 1); - for (Object configParamObj : configParamVal) { - Element childElement = tikaConfigDoc.createElement(singularName); - childElement.setTextContent(Objects.toString(configParamObj)); - configElm.appendChild(childElement); - } - } else { - configElm.setTextContent(Objects.toString(configParam.getValue())); - } - } + this.tikaConfigPath = tikaConfigPath; } +// +// +// private void updateTikaConfig() +// throws ParserConfigurationException, IOException, SAXException, TransformerException { +// Document tikaConfigDoc = +// DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath); +// +// Element fetchersElement = (Element) tikaConfigDoc.getElementsByTagName("fetchers").item(0); +// if (fetchersElement == null) { +// fetchersElement = tikaConfigDoc.createElement("fetchers"); +// 
tikaConfigDoc.getDocumentElement().appendChild(fetchersElement); +// } +// for (int i = 0; i < fetchersElement.getChildNodes().getLength(); ++i) { +// fetchersElement.removeChild(fetchersElement.getChildNodes().item(i)); +// } +// for (var fetcherConfigEntry : expiringFetcherStore.getFetcherConfigs().entrySet()) { +// Fetcher fetcherObject = getFetcher(fetcherConfigEntry.getValue().getFetcherPluginId()); +// Map fetcherConfigParams = OBJECT_MAPPER.convertValue( +// expiringFetcherStore.getFetcherConfigs().get(fetcherConfigEntry.getKey()), +// new TypeReference<>() { +// }); +// Element fetcher = tikaConfigDoc.createElement("fetcher"); +// fetcher.setAttribute("class", fetcherConfigEntry.getValue().getClass().getName()); +// +// Element fetcherIdElm = tikaConfigDoc.createElement("fetcherId"); +// fetcherIdElm.setTextContent(fetcherObject.getPluginId()); +// fetcher.appendChild(fetcherIdElm); +// +// populateFetcherConfigs(fetcherConfigParams, tikaConfigDoc, fetcher); +// fetchersElement.appendChild(fetcher); +// } +// DOMSource source = new DOMSource(tikaConfigDoc); +// FileWriter writer = new FileWriter(tikaConfigPath, StandardCharsets.UTF_8); +// StreamResult result = new StreamResult(writer); +// +// TransformerFactory transformerFactory = TransformerFactory.newInstance(); +// Transformer transformer = transformerFactory.newTransformer(); +// transformer.transform(source, result); +// } +// +// private void populateFetcherConfigs(Map fetcherConfigParams, +// Document tikaConfigDoc, Element fetcher) { +// for (var configParam : fetcherConfigParams.entrySet()) { +// Element configElm = tikaConfigDoc.createElement(configParam.getKey()); +// fetcher.appendChild(configElm); +// if (configParam.getValue() instanceof List) { +// List configParamVal = (List) configParam.getValue(); +// String singularName = configParam.getKey().substring(0, configParam.getKey().length() - 1); +// for (Object configParamObj : configParamVal) { +// Element childElement = 
tikaConfigDoc.createElement(singularName); +// childElement.setTextContent(Objects.toString(configParamObj)); +// configElm.appendChild(childElement); +// } +// } else { +// configElm.setTextContent(Objects.toString(configParam.getValue())); +// } +// } +// } @Override public void fetchAndParseServerSideStreaming(FetchAndParseRequest request, @@ -210,10 +203,10 @@ public void fetchAndParse(FetchAndParseRequest request, private void fetchAndParseImpl(FetchAndParseRequest request, StreamObserver responseObserver) { - AbstractFetcher fetcher = - expiringFetcherStore.getFetcherAndLogAccess(request.getFetcherId()); - if (fetcher == null) { - throw new RuntimeException( + FetcherConfig fetcherConfig = + expiringFetcherStore.getFetcherConfigAndLogAccess(request.getFetcherId()); + if (fetcherConfig == null) { + throw new TikaGrpcException( "Could not find fetcher with name " + request.getFetcherId()); } Metadata tikaMetadata = new Metadata(); @@ -222,16 +215,16 @@ private void fetchAndParseImpl(FetchAndParseRequest request, String additionalFetchConfigJson = request.getAdditionalFetchConfigJson(); if (StringUtils.isNotBlank(additionalFetchConfigJson)) { // The fetch and parse has the option to specify additional configuration - AbstractConfig abstractConfig = expiringFetcherStore + FetcherConfig abstractFetcherConfig = expiringFetcherStore .getFetcherConfigs() - .get(fetcher.getName()); + .get(request.getFetcherId()); parseContext.set(FetcherConfigContainer.class, new FetcherConfigContainer() - .setConfigClassName(abstractConfig + .setConfigClassName(abstractFetcherConfig .getClass().getName()) .setJson(additionalFetchConfigJson)); } PipesResult pipesResult = pipesClient.process(new FetchEmitTuple(request.getFetchKey(), - new FetchKey(fetcher.getName(), request.getFetchKey()), new EmitKey(), tikaMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + new FetchKey(request.getFetcherId(), request.getFetchKey()), new EmitKey(), tikaMetadata, parseContext, 
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); FetchAndParseReply.Builder fetchReplyBuilder = FetchAndParseReply.newBuilder() .setFetchKey(request.getFetchKey()) @@ -251,7 +244,7 @@ private void fetchAndParseImpl(FetchAndParseRequest request, } responseObserver.onNext(fetchReplyBuilder.build()); } catch (IOException e) { - throw new RuntimeException(e); + throw new TikaGrpcException(e); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } @@ -261,50 +254,47 @@ private void fetchAndParseImpl(FetchAndParseRequest request, @Override public void saveFetcher(SaveFetcherRequest request, StreamObserver responseObserver) { - SaveFetcherReply reply = - SaveFetcherReply.newBuilder().setFetcherId(request.getFetcherId()).build(); - try { - Map fetcherConfigMap = OBJECT_MAPPER.readValue(request.getFetcherConfigJson(), new TypeReference<>() {}); - Map tikaParamsMap = createTikaParamMap(fetcherConfigMap); - saveFetcher(request.getFetcherId(), request.getFetcherClass(), fetcherConfigMap, tikaParamsMap); - updateTikaConfig(); - } catch (Exception e) { - throw new RuntimeException(e); - } - responseObserver.onNext(reply); - responseObserver.onCompleted(); +// SaveFetcherReply reply = +// SaveFetcherReply.newBuilder().setFetcherId(request.getFetcherId()).build(); +// try { +// Map fetcherConfigMap = OBJECT_MAPPER.readValue(request.getFetcherConfigJson(), new TypeReference<>() {}); +// Map tikaParamsMap = createTikaParamMap(fetcherConfigMap); +// saveFetcher(request.getFetcherId(), request.getPluginId(), fetcherConfigMap, tikaParamsMap); +// updateTikaConfig(); +// } catch (Exception e) { +// throw new TikaGrpcException(e); +// } +// responseObserver.onNext(reply); +// responseObserver.onCompleted(); } - private void saveFetcher(String name, String fetcherClassName, Map paramsMap, Map tikaParamsMap) { - try { - if (paramsMap == null) { - paramsMap = new LinkedHashMap<>(); - } - Class fetcherClass = - (Class) Class.forName(fetcherClassName); - String configClassName = - 
fetcherClass.getPackageName() + ".config." + fetcherClass.getSimpleName() + - "Config"; - Class configClass = - (Class) Class.forName(configClassName); - AbstractConfig configObject = OBJECT_MAPPER.convertValue(paramsMap, configClass); - AbstractFetcher abstractFetcher = - fetcherClass.getDeclaredConstructor(configClass).newInstance(configObject); - abstractFetcher.setName(name); - if (Initializable.class.isAssignableFrom(fetcherClass)) { - Initializable initializable = (Initializable) abstractFetcher; - initializable.initialize(tikaParamsMap); - } - if (expiringFetcherStore.deleteFetcher(name)) { - LOG.info("Updating fetcher {}", name); - } else { - LOG.info("Creating new fetcher {}", name); - } - expiringFetcherStore.createFetcher(abstractFetcher, configObject); - } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | - InvocationTargetException | NoSuchMethodException | TikaConfigException e) { - throw new RuntimeException(e); - } + private void saveFetcher(String fetcherId, String pluginId, Map paramsMap, Map tikaParamsMap) { +// try { +// if (paramsMap == null) { +// paramsMap = new LinkedHashMap<>(); +// } +// Fetcher fetcher = getFetcher(pluginId); +// Class fetcherClass = fetcher.getClass(); +// String configClassName = +// fetcherClass.getPackageName() + ".config." 
+ fetcherClass.getSimpleName() + +// "Config"; +// +// Class configClass = +// (Class) Class.forName(configClassName, true, fetcher.getClass().getClassLoader()); +// FetcherConfig configObject = OBJECT_MAPPER.convertValue(paramsMap, configClass); +// if (Initializable.class.isAssignableFrom(fetcherClass)) { +// Initializable initializable = (Initializable) fetcher; +// initializable.initialize(tikaParamsMap); +// } +// if (expiringFetcherStore.deleteFetcher(fetcherId)) { +// LOG.info("Updating fetcher {}", fetcherId); +// } else { +// LOG.info("Creating new fetcher {}", fetcherId); +// } +// expiringFetcherStore.createFetcher(fetcherId, configObject); +// } catch (ClassNotFoundException | TikaConfigException e) { +// throw new TikaGrpcException("Could not create fetcher", e); +// } } private static Map createTikaParamMap(Map fetcherConfigMap) { @@ -328,16 +318,15 @@ static Status notFoundStatus(String fetcherId) { public void getFetcher(GetFetcherRequest request, StreamObserver responseObserver) { GetFetcherReply.Builder getFetcherReply = GetFetcherReply.newBuilder(); - AbstractConfig abstractConfig = + FetcherConfig fetcherConfig = expiringFetcherStore.getFetcherConfigs().get(request.getFetcherId()); - AbstractFetcher abstractFetcher = expiringFetcherStore.getFetchers().get(request.getFetcherId()); - if (abstractFetcher == null || abstractConfig == null) { + if (fetcherConfig == null) { responseObserver.onError(StatusProto.toStatusException(notFoundStatus(request.getFetcherId()))); return; } getFetcherReply.setFetcherId(request.getFetcherId()); - getFetcherReply.setFetcherClass(abstractFetcher.getClass().getName()); - Map paramMap = OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() {}); + getFetcherReply.setPluginId(fetcherConfig.getFetcherPluginId()); + Map paramMap = OBJECT_MAPPER.convertValue(fetcherConfig, new TypeReference<>() {}); paramMap.forEach( (k, v) -> getFetcherReply.putParams(Objects.toString(k), Objects.toString(v))); 
responseObserver.onNext(getFetcherReply.build()); @@ -348,8 +337,8 @@ public void getFetcher(GetFetcherRequest request, public void listFetchers(ListFetchersRequest request, StreamObserver responseObserver) { ListFetchersReply.Builder listFetchersReplyBuilder = ListFetchersReply.newBuilder(); - for (Map.Entry fetcherConfig : expiringFetcherStore.getFetcherConfigs() - .entrySet()) { + for (Map.Entry fetcherConfig : expiringFetcherStore.getFetcherConfigs() + .entrySet()) { GetFetcherReply.Builder replyBuilder = saveFetcherReply(fetcherConfig); listFetchersReplyBuilder.addGetFetcherReplies(replyBuilder.build()); } @@ -358,22 +347,19 @@ public void listFetchers(ListFetchersRequest request, } private GetFetcherReply.Builder saveFetcherReply( - Map.Entry fetcherConfig) { - AbstractFetcher abstractFetcher = - expiringFetcherStore.getFetchers().get(fetcherConfig.getKey()); - AbstractConfig abstractConfig = - expiringFetcherStore.getFetcherConfigs().get(fetcherConfig.getKey()); + Map.Entry fetcherConfigEntry) { + FetcherConfig fetcherConfig = fetcherConfigEntry.getValue(); GetFetcherReply.Builder replyBuilder = - GetFetcherReply.newBuilder().setFetcherClass(abstractFetcher.getClass().getName()) - .setFetcherId(abstractFetcher.getName()); - loadParamsIntoReply(abstractConfig, replyBuilder); + GetFetcherReply.newBuilder().setPluginId(fetcherConfig.getFetcherPluginId()) + .setFetcherId(fetcherConfig.getFetcherId()); + loadParamsIntoReply(fetcherConfig, replyBuilder); return replyBuilder; } - private static void loadParamsIntoReply(AbstractConfig abstractConfig, + private static void loadParamsIntoReply(FetcherConfig fetcherConfig, GetFetcherReply.Builder replyBuilder) { Map paramMap = - OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() { + OBJECT_MAPPER.convertValue(fetcherConfig, new TypeReference<>() { }); if (paramMap != null) { paramMap.forEach( @@ -386,11 +372,11 @@ public void deleteFetcher(DeleteFetcherRequest request, StreamObserver responseObserver) { 
boolean successfulDelete = deleteFetcher(request.getFetcherId()); if (successfulDelete) { - try { - updateTikaConfig(); - } catch (Exception e) { - throw new RuntimeException(e); - } +// try { +// updateTikaConfig(); +// } catch (Exception e) { +// throw new TikaGrpcException(e); +// } } responseObserver.onNext(DeleteFetcherReply.newBuilder().setSuccess(successfulDelete).build()); responseObserver.onCompleted(); @@ -398,15 +384,21 @@ public void deleteFetcher(DeleteFetcherRequest request, @Override public void getFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest request, StreamObserver responseObserver) { - GetFetcherConfigJsonSchemaReply.Builder builder = GetFetcherConfigJsonSchemaReply.newBuilder(); - try { - JsonSchema jsonSchema = JSON_SCHEMA_GENERATOR.generateSchema(Class.forName(request.getFetcherClass())); - builder.setFetcherConfigJsonSchema(OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema)); - } catch (ClassNotFoundException | JsonProcessingException e) { - throw new RuntimeException("Could not create json schema for " + request.getFetcherClass(), e); - } - responseObserver.onNext(builder.build()); - responseObserver.onCompleted(); +// GetFetcherConfigJsonSchemaReply.Builder builder = GetFetcherConfigJsonSchemaReply.newBuilder(); +// try { +// Fetcher fetcher = getFetcher(request.getPluginId()); +// JsonSchema jsonSchema = JSON_SCHEMA_GENERATOR.generateSchema(fetcher.getClass()); +// builder.setFetcherConfigJsonSchema(OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema)); +// } catch (JsonProcessingException e) { +// throw new TikaGrpcException("Could not create json schema for fetcher with plugin ID " + request.getPluginId(), e); +// } +// responseObserver.onNext(builder.build()); +// responseObserver.onCompleted(); + } + + @Override + public void listFetcherPlugins(ListFetcherPluginsRequest request, StreamObserver responseObserver) { + // todo } private boolean deleteFetcher(String fetcherName) 
{ diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/exception/TikaGrpcException.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/exception/TikaGrpcException.java new file mode 100644 index 0000000000..21a95f1358 --- /dev/null +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/exception/TikaGrpcException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.grpc.exception; + +public class TikaGrpcException extends RuntimeException { + public TikaGrpcException(Throwable cause) { + super(cause); + } + + public TikaGrpcException(String message, Throwable cause) { + super(message, cause); + } + + public TikaGrpcException(String message) { + super(message); + } +} diff --git a/tika-grpc/src/main/proto/tika.proto b/tika-grpc/src/main/proto/tika.proto index 572ded7abd..671bdd04c3 100644 --- a/tika-grpc/src/main/proto/tika.proto +++ b/tika-grpc/src/main/proto/tika.proto @@ -59,6 +59,11 @@ service Tika { Get the Fetcher Config schema for a given fetcher class. 
*/ rpc GetFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest) returns (GetFetcherConfigJsonSchemaReply) {} + /* + List fetcher plugins + */ + rpc ListFetcherPlugins(ListFetcherPluginsRequest) returns (ListFetcherPluginsReply) {} + } message SaveFetcherRequest { @@ -66,7 +71,7 @@ message SaveFetcherRequest { string fetcher_id = 1; // The full java class name of the fetcher class. List of // fetcher classes is found here: https://cwiki.apache.org/confluence/display/TIKA/tika-pipes - string fetcher_class = 2; + string plugin_id = 2; // JSON string of the fetcher config object. To see the json schema from which to build this json, // use the GetFetcherConfigJsonSchema rpc method. string fetcher_config_json = 3; @@ -117,7 +122,7 @@ message GetFetcherReply { // Echoes the ID of the fetcher being returned. string fetcher_id = 1; // The full Java class name of the Fetcher. - string fetcher_class = 2; + string plugin_id = 2; // The configuration parameters. map params = 3; } @@ -136,10 +141,17 @@ message ListFetchersReply { message GetFetcherConfigJsonSchemaRequest { // The full java class name of the fetcher config for which to fetch json schema. - string fetcher_class = 1; + string plugin_id = 1; } message GetFetcherConfigJsonSchemaReply { // The json schema that describes the fetcher config in string format. 
string fetcher_config_json_schema = 1; } + +message ListFetcherPluginsRequest { +} + +message ListFetcherPluginsReply { + string fetcher_plugin_id = 1; +} diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java index 264c366f38..2ae636478a 100644 --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java @@ -22,44 +22,55 @@ import java.time.Duration; import org.awaitility.Awaitility; +import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.AbstractFetcher; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; class ExpiringFetcherStoreTest { @Test void createFetcher() { try (ExpiringFetcherStore expiringFetcherStore = new ExpiringFetcherStore(1, 5)) { - AbstractFetcher fetcher = new AbstractFetcher() { - @Override - public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) { - return null; - } - }; - fetcher.setName("nick"); - AbstractConfig config = new AbstractConfig() { - }; - expiringFetcherStore.createFetcher(fetcher, config); + FetcherConfig config = getFetcherConfig(); + String fetcherId = "nicksFetcherId"; + expiringFetcherStore.createFetcher(fetcherId, config); Assertions.assertNotNull(expiringFetcherStore - .getFetchers() - .get(fetcher.getName())); + .getFetcherConfigs() + .get(fetcherId)); Awaitility .await() .atMost(Duration.ofSeconds(60)) .until(() -> expiringFetcherStore - .getFetchers() - .get(fetcher.getName()) == null); + .getFetcherConfigs() + .get(fetcherId) == null); assertNull(expiringFetcherStore .getFetcherConfigs() - 
.get(fetcher.getName())); + .get(fetcherId)); } } + + @NotNull + private static FetcherConfig getFetcherConfig() { + AbstractFetcher fetcher = new AbstractFetcher() { + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) { + return null; + } + }; + fetcher.setPluginId("nicksPlugin"); + return new FetcherConfig() { + @Override + public String getFetcherPluginId() { + return fetcher.getPluginId(); + } + }; + } } diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java index e78110abb1..cb4559bc58 100644 --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java @@ -55,7 +55,7 @@ import org.apache.tika.SaveFetcherReply; import org.apache.tika.SaveFetcherRequest; import org.apache.tika.TikaGrpc; -import org.apache.tika.pipes.fetcher.http.HttpFetcher; +import org.apache.tika.pipes.fetcher.http.config.HttpFetcherConfig; /** * This test will start an HTTP server using jetty. 
@@ -155,7 +155,7 @@ void createHttpFetcher() throws Exception { SaveFetcherRequest saveFetcherRequest = SaveFetcherRequest .newBuilder() .setFetcherId(httpFetcherId) - .setFetcherClass(HttpFetcher.class.getName()) + .setPluginId(HttpFetcherConfig.PLUGIN_ID) .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap .builder() .put("requestTimeout", 30_000) diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java index 80f391e33b..6572a201a6 100644 --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java @@ -23,6 +23,7 @@ import java.io.File; import java.nio.charset.StandardCharsets; +import java.nio.file.Path; import java.nio.file.Paths; import java.time.Duration; import java.time.LocalDateTime; @@ -66,6 +67,7 @@ import org.apache.tika.TikaGrpc; import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher; +import org.apache.tika.pipes.fetcher.fs.config.FileSystemFetcherConfig; @ExtendWith(GrpcCleanupExtension.class) public class TikaGrpcServerTest { @@ -84,6 +86,12 @@ static void init() throws Exception { } static final int NUM_FETCHERS_TO_CREATE = 10; + static List pluginDirs; + + @BeforeAll + static void loadPluginManager() { + pluginDirs = Collections.singletonList(Path.of("..", "tika-pipes", "tika-fetchers")); + } @Test public void testFetcherCrud(Resources resources) throws Exception { @@ -93,7 +101,7 @@ public void testFetcherCrud(Resources resources) throws Exception { Server server = InProcessServerBuilder .forName(serverName) .directExecutor() - .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())) + .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath(), pluginDirs)) .build() .start(); resources.register(server, Duration.ofSeconds(10)); @@ -112,7 +120,7 @@ public void 
testFetcherCrud(Resources resources) throws Exception { SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest .newBuilder() .setFetcherId(fetcherId) - .setFetcherClass(FileSystemFetcher.class.getName()) + .setPluginId(FileSystemFetcherConfig.PLUGIN_ID) .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap .builder() .put("basePath", targetFolder) @@ -127,7 +135,7 @@ public void testFetcherCrud(Resources resources) throws Exception { SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest .newBuilder() .setFetcherId(fetcherId) - .setFetcherClass(FileSystemFetcher.class.getName()) + .setPluginId(FileSystemFetcherConfig.PLUGIN_ID) .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap .builder() .put("basePath", targetFolder) @@ -152,7 +160,7 @@ public void testFetcherCrud(Resources resources) throws Exception { .setFetcherId(fetcherId) .build()); assertEquals(fetcherId, getFetcherReply.getFetcherId()); - assertEquals(FileSystemFetcher.class.getName(), getFetcherReply.getFetcherClass()); + assertEquals(FileSystemFetcherConfig.PLUGIN_ID, getFetcherReply.getPluginId()); } // delete fetchers @@ -188,7 +196,7 @@ public void testBiStream(Resources resources) throws Exception { Server server = InProcessServerBuilder .forName(serverName) .directExecutor() - .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())) + .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath(), pluginDirs)) .build() .start(); resources.register(server, Duration.ofSeconds(10)); @@ -206,7 +214,7 @@ public void testBiStream(Resources resources) throws Exception { SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest .newBuilder() .setFetcherId(fetcherId) - .setFetcherClass(FileSystemFetcher.class.getName()) + .setPluginId(FileSystemFetcherConfig.PLUGIN_ID) .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap .builder() .put("basePath", targetFolder) diff --git 
a/tika-grpc/src/test/resources/tika-pipes-test-config.xml b/tika-grpc/src/test/resources/tika-pipes-test-config.xml index e4006edb35..e7f4240c38 100644 --- a/tika-grpc/src/test/resources/tika-pipes-test-config.xml +++ b/tika-grpc/src/test/resources/tika-pipes-test-config.xml @@ -13,8 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ---> - +--> 600 60 @@ -30,6 +29,5 @@ -1 - - - + + \ No newline at end of file diff --git a/tika-integration-tests/pom.xml b/tika-integration-tests/pom.xml index 5d641d852c..620243c894 100644 --- a/tika-integration-tests/pom.xml +++ b/tika-integration-tests/pom.xml @@ -58,6 +58,12 @@ junit-vintage-engine test + + + org.pf4j + pf4j + provided + diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java index b32304d69b..1019a1d99d 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java @@ -38,8 +38,12 @@ import com.amazonaws.services.s3.iterable.S3Objects; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectSummary; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -51,12 +55,26 @@ import org.apache.tika.pipes.fetcher.FetcherManager; import org.apache.tika.pipes.pipesiterator.CallablePipesIterator; import 
org.apache.tika.pipes.pipesiterator.PipesIterator; +import org.apache.tika.pipes.plugin.TikaPluginManager; @Disabled("turn these into actual tests with mock s3") public class PipeIntegrationTests { + private static final Logger LOG = LoggerFactory.getLogger(PipeIntegrationTests.class); private static final Path OUTDIR = Paths.get(""); + PluginManager pluginManager; + + @BeforeEach + void init() throws IOException { + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. + Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + } + @Test public void testBruteForce() throws Exception { String region = ""; @@ -146,7 +164,7 @@ public void testS3ToS3() throws Exception { } private Fetcher getFetcher(String fileName, String fetcherName) throws Exception { - FetcherManager manager = FetcherManager.load(getPath(fileName)); + FetcherManager manager = FetcherManager.load(pluginManager); return manager.getFetcher(fetcherName); } diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 5ec17e8bc4..e16d3a2a2e 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -448,6 +448,7 @@ 1.5.6-4 9.40 1.5.10 + 3.12.0 @@ -1038,6 +1039,11 @@ jspecify 1.0.0 + + org.pf4j + pf4j + ${pf4j.version} + diff --git a/tika-pipes/pom.xml b/tika-pipes/pom.xml index d3bce6a4e7..d0a06dd1f3 100644 --- a/tika-pipes/pom.xml +++ b/tika-pipes/pom.xml @@ -30,12 +30,14 @@ pom + tika-pipes-core tika-httpclient-commons tika-fetchers tika-emitters tika-pipes-iterators tika-pipes-reporters tika-async-cli + tika-serialization diff --git a/tika-pipes/tika-async-cli/pom.xml b/tika-pipes/tika-async-cli/pom.xml index 9cccbe9a2a..6d64b03ee9 100644 --- a/tika-pipes/tika-async-cli/pom.xml +++ b/tika-pipes/tika-async-cli/pom.xml @@ -44,6 +44,11 @@ 
test-jar test + + org.apache.tika + tika-pipes-core + ${project.version} + org.apache.logging.log4j diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java index acadeeb7af..9b6a53933a 100644 --- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java +++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java @@ -33,6 +33,7 @@ import org.junit.jupiter.api.io.TempDir; import org.apache.tika.TikaTest; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; @@ -40,7 +41,6 @@ import org.apache.tika.pipes.HandlerConfig; import org.apache.tika.pipes.async.AsyncProcessor; import org.apache.tika.pipes.emitter.EmitKey; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.pipesiterator.PipesIterator; import org.apache.tika.serialization.JsonMetadataList; diff --git a/tika-pipes/tika-emitters/pom.xml b/tika-pipes/tika-emitters/pom.xml index 6ae038c7a9..afa9d50a87 100644 --- a/tika-pipes/tika-emitters/pom.xml +++ b/tika-pipes/tika-emitters/pom.xml @@ -42,7 +42,16 @@ tika-emitter-jdbc + + + ${project.groupId} + tika-pipes-core + ${project.version} + provided + + + 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml b/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml index f34850a2a9..801ffaba01 100644 --- a/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml +++ b/tika-pipes/tika-emitters/tika-emitter-fs/pom.xml @@ -113,4 +113,4 @@ 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 
23b9d73f17..b4637e0c20 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -17,7 +17,8 @@ specific language governing permissions and limitations under the License. --> - + org.apache.tika tika-pipes @@ -37,12 +38,81 @@ tika-fetcher-gcs tika-fetcher-az-blob tika-fetcher-microsoft-graph + tika-fetcher-file-system + tika-fetcher-url - + + org.pf4j + pf4j + + provided + + + org.apache.tika + tika-pipes-core + ${project.version} + + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + + + org.slf4j + jcl-over-slf4j + + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + + + org.slf4j + jcl-over-slf4j + + + commons-io + commons-io + + + ${project.groupId} + tika-core + ${project.version} + provided + + + ${project.groupId} + tika-core + ${project.version} + test-jar + test + + + ${project.groupId} + tika-serialization + ${project.version} + test + + + org.mockito + mockito-core + test + + + org.junit.jupiter + junit-jupiter-engine + test + 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml index f0d7642e52..b45912e824 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/pom.xml @@ -17,110 +17,67 @@ specific language governing permissions and limitations under the License. 
--> - - - tika-fetchers - org.apache.tika - 3.0.0-SNAPSHOT - - 4.0.0 + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + 4.0.0 - tika-fetcher-az-blob - Apache Tika Azure Blob fetcher + tika-fetcher-az-blob + Apache Tika Azure Blob fetcher - - - ${project.groupId} - tika-core - ${project.version} - provided - - - com.azure - azure-storage-blob - - - ${project.groupId} - tika-core - ${project.version} - test-jar - test - - - ${project.groupId} - tika-serialization - ${project.version} - test - - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - org.apache.tika.pipes.fetcher.azblob - - - - - - - test-jar - - - - - - maven-shade-plugin - ${maven.shade.version} - - - package - - shade - - - - false - - - - - *:* - - META-INF/* - LICENSE.txt - NOTICE.txt - - - - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - META-INF/NOTICE - target/classes/META-INF/NOTICE - - - META-INF/DEPENDENCIES - target/classes/META-INF/DEPENDENCIES - - - - - - - - - + + + com.azure + azure-storage-blob + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + + + + + + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + + + make-assembly + package + + single + + + + + + 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties + + + + diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java index 0dc05a2d59..13057f5795 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcher.java @@ -29,6 +29,7 @@ import com.azure.storage.blob.BlobServiceClient; import com.azure.storage.blob.BlobServiceClientBuilder; import com.azure.storage.blob.models.BlobProperties; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,6 +57,7 @@ * 2) If you have different endpoints or sas tokens or containers across * your requests, your fetchKey will be the complete SAS url pointing to the blob. */ +@Extension public class AZBlobFetcher extends AbstractFetcher implements Initializable { public AZBlobFetcher() { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcherPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcherPlugin.java new file mode 100644 index 0000000000..5ae15613ad --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/AZBlobFetcherPlugin.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.azblob; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class AZBlobFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(AZBlobFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java index 2bfe61fa79..5dc091ff23 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/java/org/apache/tika/pipes/fetcher/azblob/config/AZBlobFetcherConfig.java @@ -16,9 +16,17 @@ */ package org.apache.tika.pipes.fetcher.azblob.config; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; + +public class AZBlobFetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "az-blob-fetcher"; + + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } -public class AZBlobFetcherConfig extends AbstractConfig { private boolean 
spoolToTemp; private String sasToken; private String endpoint; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/resources/plugin.properties new file mode 100644 index 0000000000..74dfeaadb6 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=az-blob-fetcher +plugin.class=org.apache.tika.pipes.fetcher.azblob.AZBlobFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=Azure Blob Fetcher +plugin.description=Capable of taking Blob IDs from AZ and using their bytes as tika parse bytes. 
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java index 1ba2cfdcd1..4527b8b0ef 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-az-blob/src/test/java/org/apache/tika/pipes/fetcher/azblob/TestAZBlobFetcher.java @@ -21,30 +21,39 @@ import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.StandardCharsets; -import java.nio.file.Paths; +import java.nio.file.Path; import java.util.List; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.plugin.TikaPluginManager; import org.apache.tika.serialization.JsonMetadataList; @Disabled("write actual unit tests") public class TestAZBlobFetcher extends TikaTest { + private static final Logger LOG = LoggerFactory.getLogger(TestAZBlobFetcher.class); private static final String FETCH_STRING = "something-or-other/test-out.json"; @Test public void testConfig() throws Exception { - FetcherManager fetcherManager = FetcherManager.load(Paths.get(this - .getClass() - .getResource("/tika-config-az-blob.xml") - .toURI())); + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. 
+ Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); + Fetcher fetcher = fetcherManager.getFetcher("az-blob"); List metadataList = null; try (Reader reader = new BufferedReader(new InputStreamReader(fetcher.fetch(FETCH_STRING, new Metadata(), new ParseContext()), StandardCharsets.UTF_8))) { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-file-system/pom.xml new file mode 100644 index 0000000000..cf40e05587 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/pom.xml @@ -0,0 +1,81 @@ + + + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + 4.0.0 + + tika-fetcher-file-system + Apache Tika FS Fetcher + Apache Tika Pipes Fetcher for Local File System + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + + + + + + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + + + make-assembly + package + + single + + + + + + + + + 3.0.0-BETA2-rc1 + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties 
+ + + + diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java similarity index 99% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java rename to tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java index bc3c4cddd3..08ac378fee 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -27,6 +27,7 @@ import java.util.Date; import java.util.Map; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,6 +46,7 @@ import org.apache.tika.pipes.fetcher.AbstractFetcher; import org.apache.tika.pipes.fetcher.fs.config.FileSystemFetcherConfig; +@Extension public class FileSystemFetcher extends AbstractFetcher implements Initializable { public FileSystemFetcher() { } diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java new file mode 100644 index 0000000000..931aa10892 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherPlugin.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.fs; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FileSystemFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java similarity index 83% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java rename to tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java index b9f155fbd7..10475d68e3 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/java/org/apache/tika/pipes/fetcher/fs/config/FileSystemFetcherConfig.java @@ -16,9 +16,16 @@ */ package org.apache.tika.pipes.fetcher.fs.config; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; -public class FileSystemFetcherConfig extends AbstractConfig { 
+public class FileSystemFetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "file-system-fetcher"; + + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } private String basePath; private boolean extractFileSystemMetadata; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/resources/plugin.properties new file mode 100644 index 0000000000..3e6f63af22 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-file-system/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=file-system-fetcher +plugin.class=org.apache.tika.pipes.fetcher.fs.FileSystemFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=Local File System Fetcher +plugin.description=Capable of fetching the local file system diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml index e3f5044d40..dd2a831c97 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/pom.xml @@ -17,97 +17,67 @@ specific language governing permissions and limitations under the License. --> - - - tika-fetchers - org.apache.tika - 3.0.0-SNAPSHOT - - 4.0.0 + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + 4.0.0 - tika-fetcher-gcs - Apache Tika Google Cloud Storage fetcher + tika-fetcher-gcs + Apache Tika Google Cloud Storage fetcher - - - ${project.groupId} - tika-core - ${project.version} - provided - - - com.google.cloud - google-cloud-storage - - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - org.apache.tika.pipes.fetcher.gcs - - - - - - - test-jar - - - - - - maven-shade-plugin - ${maven.shade.version} - - - package - - shade - - - - false - - - - - *:* - - META-INF/* - LICENSE.txt - NOTICE.txt - - - - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - META-INF/NOTICE - target/classes/META-INF/NOTICE - - - META-INF/DEPENDENCIES - target/classes/META-INF/DEPENDENCIES - - - - - - - - - + + + com.google.cloud + google-cloud-storage + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + + + + + + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + + + make-assembly + package + + single + + + + + + 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/assembly/assembly.xml 
b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties + + + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java index 75f89527e8..271eedb68d 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcher.java @@ -27,6 +27,7 @@ import com.google.cloud.storage.BlobId; import com.google.cloud.storage.Storage; import com.google.cloud.storage.StorageOptions; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,6 +47,7 @@ /** * Fetches files from google cloud storage. Must set projectId and bucket via the config. */ +@Extension public class GCSFetcher extends AbstractFetcher implements Initializable { public GCSFetcher() { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcherPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcherPlugin.java new file mode 100644 index 0000000000..c90ebb140b --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/GCSFetcherPlugin.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.gcs; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class GCSFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(GCSFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java index a8dad6417d..bf934ae95d 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/java/org/apache/tika/pipes/fetcher/gcs/config/GCSFetcherConfig.java @@ -16,9 +16,16 @@ */ package org.apache.tika.pipes.fetcher.gcs.config; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; -public class GCSFetcherConfig extends 
AbstractConfig { +public class GCSFetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "gcs-fetcher"; + + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } private boolean spoolToTemp; private String projectId; private String bucket; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/resources/plugin.properties new file mode 100644 index 0000000000..79e5590e87 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=gcs-fetcher +plugin.class=org.apache.tika.pipes.fetcher.gcs.GCSFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=GCS Fetcher +plugin.description=GCS Fetcher diff --git a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java index e685520507..87b087df7d 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-gcs/src/test/java/org/apache/tika/pipes/fetcher/s3/TestGCSFetcher.java @@ -21,21 +21,25 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.plugin.TikaPluginManager; @Disabled("write actual unit tests") public class TestGCSFetcher { + private static final Logger LOG = LoggerFactory.getLogger(TestGCSFetcher.class); private static final String FETCH_STRING = "testExtraSpaces.pdf"; @@ -48,11 +52,15 @@ public static void setUp() throws Exception { outputFile = Files.createTempFile(TEMP_DIR, "tika-test", ".pdf"); } - @Test public void testConfig() throws Exception { - FetcherManager fetcherManager = FetcherManager.load( - Paths.get(this.getClass().getResource("/tika-config-gcs.xml").toURI())); + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. 
+ Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); Fetcher fetcher = fetcherManager.getFetcher("gcs"); Metadata metadata = new Metadata(); try (InputStream is = fetcher.fetch(FETCH_STRING, metadata, new ParseContext())) { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml index 320569ed53..d125595a3b 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/pom.xml @@ -17,7 +17,8 @@ specific language governing permissions and limitations under the License. --> - + tika-fetchers org.apache.tika @@ -29,17 +30,6 @@ Apache Tika http fetcher - - org.apache.logging.log4j - log4j-slf4j2-impl - provided - - - ${project.groupId} - tika-core - ${project.version} - provided - ${project.groupId} tika-httpclient-commons @@ -61,79 +51,44 @@ com.fasterxml.jackson.core jackson-annotations - - ${project.groupId} - tika-core - ${project.version} - test-jar - test - - - org.mockito - mockito-core - test - org.apache.maven.plugins - maven-jar-plugin - - - - org.apache.tika.pipes.fetcher.http - - - + maven-dependency-plugin + 3.6.1 + copy-dependencies + package - test-jar + copy-dependencies + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + - maven-shade-plugin - ${maven.shade.version} + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + make-assembly package - shade + single - - - false - - - - - *:* - - META-INF/* - LICENSE.txt - NOTICE.txt - - - - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - META-INF/NOTICE - target/classes/META-INF/NOTICE - - - META-INF/DEPENDENCIES - 
target/classes/META-INF/DEPENDENCIES - - - diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties + + + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java index 893e6c77b7..f2ef38a430 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java @@ -58,6 +58,7 @@ import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.impl.conn.ConnectionShutdownException; import org.apache.http.util.EntityUtils; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -88,6 +89,7 @@ /** * Based on Apache httpclient */ +@Extension public class HttpFetcher extends AbstractFetcher implements Initializable, RangeFetcher { public HttpFetcher() { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcherPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcherPlugin.java new file mode 100644 index 0000000000..bd77c10268 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcherPlugin.java @@ -0,0 
+1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.http; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HttpFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(HttpFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java index 5274a65f9b..62446f4954 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/config/HttpFetcherConfig.java @@ -19,9 +19,16 @@ import java.util.ArrayList; import java.util.List; -import 
org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; -public class HttpFetcherConfig extends AbstractConfig { +public class HttpFetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "http-fetcher"; + + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } private String userName; private String password; private String ntDomain; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/resources/plugin.properties new file mode 100644 index 0000000000..ecd3cb5123 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=http-fetcher +plugin.class=org.apache.tika.pipes.fetcher.http.HttpFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=HTTP Fetcher +plugin.description=HTTP web request fetcher diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java index c888db8ae1..159896546f 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java @@ -28,7 +28,6 @@ import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.security.SecureRandom; import java.util.ArrayList; @@ -60,6 +59,9 @@ import org.junit.jupiter.api.Test; import org.mockito.ArgumentCaptor; import org.mockito.Mockito; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.TikaTest; import org.apache.tika.client.HttpClientFactory; @@ -74,8 +76,10 @@ import org.apache.tika.pipes.fetcher.http.config.HttpFetcherConfig; import org.apache.tika.pipes.fetcher.http.config.HttpHeaders; import org.apache.tika.pipes.fetcher.http.jwt.JwtGenerator; +import org.apache.tika.pipes.plugin.TikaPluginManager; class HttpFetcherTest extends TikaTest { + private static final Logger LOG = LoggerFactory.getLogger(HttpFetcherTest.class); private static final String TEST_URL = "wontbecalled"; private static final String CONTENT = "request content"; @@ -266,9 +270,13 @@ public void testRange() throws Exception { } FetcherManager getFetcherManager(String path) throws Exception { - return FetcherManager.load(Paths.get(HttpFetcherTest.class - .getResource("/" + path) - .toURI())); + 
System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. + Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + return FetcherManager.load(pluginManager); } private void mockClientResponse(final HttpResponse response) throws Exception { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml index 7b9e095f9a..d3c6eb6d22 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/pom.xml @@ -35,12 +35,13 @@ UTF-8 1.13.2 6.13.0 - 1.1.1 + 1.2.0 3.3.1 5.12.0 9.40 1.16.2 2.0.10 + 2.42.0 @@ -63,11 +64,6 @@ - - ${project.groupId} - tika-core - ${project.version} - com.microsoft.graph microsoft-graph @@ -103,14 +99,19 @@ ${kotlin-stdlib.version} - org.junit.jupiter - junit-jupiter-engine - test - - - org.mockito - mockito-core - test + com.google.cloud + google-cloud-storage + ${google-cloud-storage.version} + + + io.opentelemetry + opentelemetry-api + + + io.opentelemetry + opentelemetry-context + + org.mockito @@ -123,65 +124,41 @@ org.apache.maven.plugins - maven-jar-plugin - - - - org.apache.tika.pipes.fetcher.s3 - - - + maven-dependency-plugin + 3.6.1 + copy-dependencies + package - test-jar + copy-dependencies + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + - maven-shade-plugin - ${maven.shade.version} + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + make-assembly package - shade + single - - - false - - - - - *:* - - META-INF/* - LICENSE.txt - NOTICE.txt - - - - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - META-INF/NOTICE - target/classes/META-INF/NOTICE - - - 
META-INF/DEPENDENCIES - target/classes/META-INF/DEPENDENCIES - - - - diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties + + + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java index 6871a70b6d..813b03a900 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphFetcher.java @@ -24,6 +24,7 @@ import com.azure.identity.ClientCertificateCredentialBuilder; import com.azure.identity.ClientSecretCredentialBuilder; import com.microsoft.graph.serviceclient.GraphServiceClient; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,6 +45,7 @@ * Fetches files from Microsoft Graph API. 
* Fetch keys are ${siteDriveId},${driveItemId} */ +@Extension public class MicrosoftGraphFetcher extends AbstractFetcher implements Initializable { private static final Logger LOGGER = LoggerFactory.getLogger(MicrosoftGraphFetcher.class); private GraphServiceClient graphClient; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphPlugin.java new file mode 100644 index 0000000000..541ba0f933 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/MicrosoftGraphPlugin.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.fetchers.microsoftgraph; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class MicrosoftGraphPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(MicrosoftGraphPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java index 495f83ba6b..68981090e6 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/java/org/apache/tika/pipes/fetchers/microsoftgraph/config/MicrosoftGraphFetcherConfig.java @@ -19,9 +19,15 @@ import java.util.ArrayList; import java.util.List; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; -public class MicrosoftGraphFetcherConfig extends AbstractConfig { +public class MicrosoftGraphFetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "microsoft-graph-fetcher"; + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } private long[] throttleSeconds; private boolean spoolToTemp; private ClientSecretCredentialsConfig clientSecretCredentialsConfig; diff --git a/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/resources/plugin.properties 
b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/resources/plugin.properties new file mode 100644 index 0000000000..6d7e508e14 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-microsoft-graph/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=microsoft-graph-fetcher +plugin.class=org.apache.tika.pipes.fetchers.microsoftgraph.MicrosoftGraphPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=Microsoft Graph Fetcher +plugin.description=Uses the Microsoft Graph API to fetch data diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml index 8c06d00993..583e3394a3 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/pom.xml @@ -17,102 +17,71 @@ specific language governing permissions and limitations under the License. 
--> - - - tika-fetchers - org.apache.tika - 3.0.0-SNAPSHOT - - 4.0.0 + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + 4.0.0 - tika-fetcher-s3 - Apache Tika S3 fetcher + tika-fetcher-s3 + Apache Tika S3 fetcher - - - com.amazonaws - aws-java-sdk-s3 - - - org.apache.logging.log4j - log4j-slf4j2-impl - provided - - - ${project.groupId} - tika-core - ${project.version} - provided - - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - org.apache.tika.pipes.fetcher.s3 - - - - - - - test-jar - - - - - - maven-shade-plugin - ${maven.shade.version} - - - package - - shade - - - - false - - - - - *:* - - META-INF/* - LICENSE.txt - NOTICE.txt - - - - - - META-INF/LICENSE - target/classes/META-INF/LICENSE - - - META-INF/NOTICE - target/classes/META-INF/NOTICE - - - META-INF/DEPENDENCIES - target/classes/META-INF/DEPENDENCIES - - - - - - - - - + + + com.amazonaws + aws-java-sdk-s3 + + + commons-io + commons-io + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + + + + + + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + + + make-assembly + package + + single + + + + + + 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties + + + + diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java index ab4a139a0f..fd9030026e 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java @@ -41,6 +41,7 @@ import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; +import org.pf4j.Extension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -66,6 +67,7 @@ * The bucket must be specified via the tika-config or before * initialization, and the fetch key is "path/to/my_file.pdf". */ +@Extension public class S3Fetcher extends AbstractFetcher implements Initializable, RangeFetcher { public S3Fetcher() { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3FetcherPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3FetcherPlugin.java new file mode 100644 index 0000000000..97676ca7d8 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3FetcherPlugin.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.s3; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class S3FetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(S3FetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/config/S3FetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/config/S3FetcherConfig.java index 84a335a2bd..33918c2602 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/config/S3FetcherConfig.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/config/S3FetcherConfig.java @@ -16,9 +16,16 @@ */ package org.apache.tika.pipes.fetcher.s3.config; -import org.apache.tika.pipes.fetcher.config.AbstractConfig; +import org.apache.tika.pipes.fetcher.config.FetcherConfig; -public class S3FetcherConfig extends AbstractConfig { +public class S3FetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "s3-fetcher"; + + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } private boolean spoolToTemp; private String region; private String profile; diff --git 
a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/resources/plugin.properties new file mode 100644 index 0000000000..31bc1c52c5 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin.id=s3-fetcher +plugin.class=org.apache.tika.pipes.fetcher.s3.S3FetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=S3 Fetcher +plugin.description=Capable of using amazon s3 sdk and fetching content. 
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java index 0055bf68a8..d8e4922194 100644 --- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/test/java/org/apache/tika/pipes/fetcher/s3/TestS3Fetcher.java @@ -25,14 +25,20 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.plugin.TikaPluginManager; @Disabled("write actual unit tests") public class TestS3Fetcher { + private static final Logger LOG = LoggerFactory.getLogger(TestS3Fetcher.class); + private static final String FETCH_STRING = ""; private final Path outputFile = Paths.get(""); private final String region = "us-east-1"; @@ -53,8 +59,14 @@ public void testBasic() throws Exception { @Test public void testConfig() throws Exception { - FetcherManager fetcherManager = FetcherManager.load( - Paths.get(this.getClass().getResource("/tika-config-s3.xml").toURI())); + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. 
+ Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); + Fetcher fetcher = fetcherManager.getFetcher("s3"); Metadata metadata = new Metadata(); try (InputStream is = fetcher.fetch(FETCH_STRING, metadata, new ParseContext())) { diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-url/pom.xml new file mode 100644 index 0000000000..a446c9e9b4 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/pom.xml @@ -0,0 +1,77 @@ + + + + + tika-fetchers + org.apache.tika + 3.0.0-SNAPSHOT + + 4.0.0 + + tika-fetcher-url + Apache Tika URL Fetcher + Apache Tika Pipes Fetcher for HTTP URLs + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.1 + + + copy-dependencies + package + + copy-dependencies + + + ${project.build.directory}/lib + compile + tika-core,tika-pipes-core + + + + + + maven-assembly-plugin + + + src/main/assembly/assembly.xml + + false + + + + make-assembly + package + + single + + + + + + + + 3.0.0-BETA2-rc1 + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/assembly/assembly.xml b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/assembly/assembly.xml new file mode 100644 index 0000000000..d614dfc367 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/assembly/assembly.xml @@ -0,0 +1,30 @@ + + dependencies-zip + + zip + + false + + + ${project.build.directory}/lib + /lib + + + ${project.build.directory} + /lib + + ${project.artifactId}-${project.version}.jar + + + + ${project.basedir}/src/main/resources + / + + plugin.properties + + + + diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java 
b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java rename to tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java index 7692516cd0..9421d702b9 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java @@ -21,6 +21,8 @@ import java.net.URL; import java.util.Locale; +import org.pf4j.Extension; + import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -33,6 +35,7 @@ * Please use the FileSystemFetcher for that. If you need more advanced control (passwords, * timeouts, proxies, etc), please use the tika-fetcher-http module. */ +@Extension public class UrlFetcher extends AbstractFetcher { @Override diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcherPlugin.java b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcherPlugin.java new file mode 100644 index 0000000000..ab73c043bc --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcherPlugin.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.url; + +import org.pf4j.Plugin; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class UrlFetcherPlugin extends Plugin { + private static final Logger LOG = LoggerFactory.getLogger(UrlFetcherPlugin.class); + @Override + public void start() { + LOG.info("Starting"); + super.start(); + } + + @Override + public void stop() { + LOG.info("Stopping"); + super.stop(); + } + + @Override + public void delete() { + LOG.info("Deleting"); + super.delete(); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/config/UrlFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/config/UrlFetcherConfig.java new file mode 100644 index 0000000000..e3f250a97f --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/java/org/apache/tika/pipes/fetcher/url/config/UrlFetcherConfig.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.url.config; + +import org.apache.tika.pipes.fetcher.config.FetcherConfig; + +public class UrlFetcherConfig extends FetcherConfig { + + public static final String PLUGIN_ID = "url-fetcher"; + + @Override + public String getFetcherPluginId() { + return PLUGIN_ID; + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/resources/plugin.properties b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/resources/plugin.properties new file mode 100644 index 0000000000..cc36bf1f5f --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-url/src/main/resources/plugin.properties @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +plugin.id=url-fetcher +plugin.class=org.apache.tika.pipes.fetcher.url.UrlFetcherPlugin +plugin.version=3.0.0-SNAPSHOT +plugin.provider=URL Fetcher +plugin.description=Capable of fetching URLs diff --git a/tika-pipes/tika-pipes-core/pom.xml b/tika-pipes/tika-pipes-core/pom.xml new file mode 100644 index 0000000000..84339a2eb1 --- /dev/null +++ b/tika-pipes/tika-pipes-core/pom.xml @@ -0,0 +1,203 @@ + + + + + + 4.0.0 + + + org.apache.tika + tika-pipes + 3.0.0-SNAPSHOT + ../pom.xml + + + tika-pipes-core + jar + Apache Tika Pipes core + https://tika.apache.org/ + + + + org.slf4j + slf4j-api + + + org.apache.tika + tika-core + ${project.version} + + + org.apache.tika + tika-core + ${project.version} + test-jar + test + + + org.pf4j + pf4j + + provided + + + commons-io + commons-io + + + + + + com.google.guava + guava + test + + + com.martensigwart + fakeload + ${fakeload.version} + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + ${checkstyle.plugin.version} + + + com.puppycrawl.tools + checkstyle + ${puppycrawl.version} + + + + + validate + validate + + checkstyle.xml + UTF-8 + false + true + ${project.basedir}/src/test/java + error + true + + + check + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + org.apache.tika.pipes.core + + + + + + + test-jar + + + + + + maven-failsafe-plugin + ${maven.failsafe.version} + + + + ${project.build.directory}/${project.build.finalName}.jar + + + + + + + integration-test + verify + + + + + + + + + + + org.codehaus.mojo + findbugs-maven-plugin + 3.0.5 + + -Xmx256m + 240000 + max + true + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + ${maven.project.info.reports.version} + + + + index + + + + + + + + This is the core Apache Tika™ toolkit library for Tika Pipes. 
+ + The Apache Software Foundation + http://www.apache.org + + + JIRA + https://issues.apache.org/jira/browse/TIKA + + + Jenkins + https://builds.apache.org/job/Tika-trunk/ + + + + 3.0.0-BETA2-rc1 + + diff --git a/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java index f8dcffb641..5d30d97c3c 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java @@ -27,6 +27,7 @@ import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.pipesiterator.TotalCountResult; +import org.apache.tika.pipes.reporter.PipesReporter; public class CompositePipesReporter extends PipesReporter implements Initializable { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java index a0f40901ba..07b5a4c69c 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java +++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java @@ -19,10 +19,10 @@ import java.io.Serializable; import java.util.Objects; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.emitter.EmitKey; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.fetcher.FetchKey; public class FetchEmitTuple implements Serializable { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesClient.java similarity index 99% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesClient.java index 0e1ca18e05..bb2a2e4ccc 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesClient.java @@ -29,6 +29,7 @@ import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -561,6 +562,9 @@ private String[] getCommandline() { commandLine.add(Long.toString(pipesConfig.getMaxForEmitBatchBytes())); commandLine.add(Long.toString(pipesConfig.getTimeoutMillis())); commandLine.add(Long.toString(pipesConfig.getShutdownClientAfterMillis())); + for (Path pluginDir : pipesConfig.getPluginDirs()) { + 
commandLine.add(ProcessUtils.escapeCommandLine(pluginDir.toAbsolutePath().toString())); + } LOG.debug("pipesClientId={}: commandline: {}", pipesClientId, commandLine); return commandLine.toArray(new String[0]); } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesConfig.java similarity index 91% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesConfig.java index 132e657a74..f8a21124f0 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesConfig.java @@ -20,6 +20,8 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; import java.util.Set; import org.slf4j.Logger; @@ -34,6 +36,10 @@ public class PipesConfig extends PipesConfigBase { private long maxWaitForClientMillis = 60000; public static PipesConfig load(Path tikaConfig) throws IOException, TikaConfigException { + return load(tikaConfig, new ArrayList<>()); + } + + public static PipesConfig load(Path tikaConfig, List pluginDirs) throws IOException, TikaConfigException { PipesConfig pipesConfig = new PipesConfig(); try (InputStream is = Files.newInputStream(tikaConfig)) { Set settings = pipesConfig.configure("pipes", is); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java similarity index 96% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java index 83ad11e9ed..ad97740946 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java +++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java @@ -60,6 +60,7 @@ public class PipesConfigBase extends ConfigBase { private int staleFetcherDelaySeconds = DEFAULT_STALE_FETCHER_DELAY_SECONDS; private List forkedJvmArgs = new ArrayList<>(); private Path tikaConfig; + private List pluginDirs; private String javaPath = "java"; public long getTimeoutMillis() { @@ -189,4 +190,12 @@ public int getStaleFetcherDelaySeconds() { public void setStaleFetcherDelaySeconds(int staleFetcherDelaySeconds) { this.staleFetcherDelaySeconds = staleFetcherDelaySeconds; } + + public List getPluginDirs() { + return pluginDirs; + } + + public void setPluginDirs(List pluginDirs) { + this.pluginDirs = pluginDirs; + } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesException.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesException.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesException.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesParser.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesParser.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesResult.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesResult.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesServer.java similarity index 97% rename from 
tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesServer.java index dffb7c9ce2..8fd82af364 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -28,6 +28,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Optional; @@ -35,6 +36,7 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; +import org.pf4j.PluginManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; @@ -48,6 +50,7 @@ import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.EmbeddedDocumentByteStoreExtractorFactory; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.extractor.EmbeddedDocumentBytesHandler; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; @@ -70,10 +73,10 @@ import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.emitter.StreamEmitter; import org.apache.tika.pipes.emitter.TikaEmitterException; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.extractor.EmittingEmbeddedDocumentBytesHandler; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.plugin.TikaPluginManager; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; @@ 
-89,7 +92,6 @@ * the PipesClient. */ public class PipesServer implements Runnable { - private static final Logger LOG = LoggerFactory.getLogger(PipesServer.class); //this has to be some number not close to 0-3 @@ -143,10 +145,12 @@ public static STATUS lookup(int val) { private volatile boolean parsing; private volatile long since; + private PluginManager pluginManager; + private List pluginDirs; public PipesServer(Path tikaConfigPath, InputStream in, PrintStream out, long maxForEmitBatchBytes, long serverParseTimeoutMillis, - long serverWaitTimeoutMillis) + long serverWaitTimeoutMillis, List pluginDirs) throws IOException, TikaException, SAXException { this.tikaConfigPath = tikaConfigPath; this.input = new DataInputStream(in); @@ -156,19 +160,23 @@ public PipesServer(Path tikaConfigPath, InputStream in, PrintStream out, this.serverWaitTimeoutMillis = serverWaitTimeoutMillis; this.parsing = false; this.since = System.currentTimeMillis(); + this.pluginDirs = pluginDirs; } - public static void main(String[] args) throws Exception { try { Path tikaConfig = Paths.get(args[0]); long maxForEmitBatchBytes = Long.parseLong(args[1]); long serverParseTimeoutMillis = Long.parseLong(args[2]); long serverWaitTimeoutMillis = Long.parseLong(args[3]); + List pluginPaths = new ArrayList<>(); + for (int i = 4; i < args.length; ++i) { + pluginPaths.add(Paths.get(args[i])); + } PipesServer server = new PipesServer(tikaConfig, System.in, System.out, maxForEmitBatchBytes, - serverParseTimeoutMillis, serverWaitTimeoutMillis); + serverParseTimeoutMillis, serverWaitTimeoutMillis, pluginPaths); System.setIn(UnsynchronizedByteArrayInputStream.builder().setByteArray(new byte[0]).get()); System.setOut(System.err); Thread watchdog = new Thread(server, "Tika Watchdog"); @@ -455,10 +463,6 @@ private Fetcher getFetcher(FetchEmitTuple t) { LOG.warn(noFetcherMsg); write(STATUS.FETCHER_NOT_FOUND, noFetcherMsg); return null; - } catch (IOException | TikaException e) { - LOG.warn("Couldn't initialize 
fetcher for fetch id '" + t.getId() + "'", e); - write(STATUS.FETCHER_INITIALIZATION_EXCEPTION, ExceptionUtils.getStackTrace(e)); - return null; } } @@ -743,8 +747,11 @@ private FetchEmitTuple readFetchEmitTuple() { protected void initializeResources() throws TikaException, IOException, SAXException { //TODO allowed named configurations in tika config + pluginManager = pluginDirs == null || pluginDirs.isEmpty() ? new TikaPluginManager() : new TikaPluginManager(pluginDirs); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); this.tikaConfig = new TikaConfig(tikaConfigPath); - this.fetcherManager = FetcherManager.load(tikaConfigPath); + this.fetcherManager = FetcherManager.load(pluginManager); //skip initialization of the emitters if emitting //from the pipesserver is turned off. if (maxForEmitBatchBytes > -1) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java index bc55cca5db..b13ba64d96 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java @@ -23,7 +23,7 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.PipesConfigBase; -import org.apache.tika.pipes.PipesReporter; +import org.apache.tika.pipes.reporter.PipesReporter; public class AsyncConfig extends PipesConfigBase { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java rename to 
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java similarity index 99% rename from tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java index 3a6751f4ff..e0939f6a2b 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java @@ -37,13 +37,13 @@ import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.PipesClient; import org.apache.tika.pipes.PipesException; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.emitter.EmitData; import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.pipesiterator.PipesIterator; import org.apache.tika.pipes.pipesiterator.TotalCountResult; import org.apache.tika.pipes.pipesiterator.TotalCounter; +import org.apache.tika.pipes.reporter.PipesReporter; /** * This is the main class for handling async requests. 
This manages diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java similarity index 94% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java index c748541afb..c8b98de177 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java @@ -19,10 +19,12 @@ import java.io.IOException; import java.util.List; +import org.pf4j.ExtensionPoint; + import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public interface Emitter { +public interface Emitter extends ExtensionPoint { String getName(); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java diff 
--git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/TikaEmitterException.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/TikaEmitterException.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/emitter/TikaEmitterException.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/emitter/TikaEmitterException.java diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/exception/PipesRuntimeException.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/exception/PipesRuntimeException.java new file mode 100644 index 0000000000..45f4982b86 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/exception/PipesRuntimeException.java @@ -0,0 +1,22 @@ +package org.apache.tika.pipes.exception; + +public class PipesRuntimeException extends RuntimeException { + public PipesRuntimeException() { + } + + public PipesRuntimeException(String message) { + super(message); + } + + public PipesRuntimeException(String message, Throwable cause) { + super(message, cause); + } + + public PipesRuntimeException(Throwable cause) { + super(cause); + } + + public PipesRuntimeException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java index 07c9f7507f..7577da8879 100644 --- 
a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java @@ -22,6 +22,7 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.AbstractEmbeddedDocumentBytesHandler; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.FetchEmitTuple; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java similarity index 77% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java index 0b417e3fb1..71a1cd394a 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java @@ -16,29 +16,23 @@ */ package org.apache.tika.pipes.fetcher; -import org.apache.tika.config.Field; - - public abstract class AbstractFetcher implements Fetcher { - private String name; + private String pluginId; public AbstractFetcher() { } - public AbstractFetcher(String name) { - this.name = name; + public AbstractFetcher(String pluginId) { + this.pluginId = pluginId; } - @Override - public String getName() { - return name; + public String getPluginId() { + return pluginId; } - @Field - public void setName(String name) { - this.name = name; + public void setPluginId(String pluginId) { + this.pluginId = pluginId; } - } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java similarity index 91% rename 
from tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java index d64f815244..60bb81d6a0 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java @@ -25,9 +25,11 @@ public class EmptyFetcher implements Fetcher { + public static final String PLUGIN_ID = "empty-fetcher"; + @Override - public String getName() { - return "empty"; + public String getPluginId() { + return PLUGIN_ID; } @Override diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java similarity index 92% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java index 8f7a186fd5..dd8ee695a7 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java @@ -19,6 +19,8 @@ import java.io.IOException; import java.io.InputStream; +import org.pf4j.ExtensionPoint; + import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -30,9 +32,9 @@ *

* Implementations of Fetcher must be thread safe. */ -public interface Fetcher { +public interface Fetcher extends ExtensionPoint { - String getName(); + String getPluginId(); InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException; } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java new file mode 100644 index 0000000000..d0a532b499 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher; + +import java.util.Set; +import java.util.stream.Collectors; + +import org.pf4j.PluginManager; + +import org.apache.tika.config.ConfigBase; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.pipes.exception.PipesRuntimeException; +import org.apache.tika.pipes.plugin.TikaPluginManager; + +/** + * Utility class to hold multiple fetchers. + *

+ * Fetchers are pf4j extensions; they are looked up by plugin id. + */ +public class FetcherManager extends ConfigBase { + private final PluginManager pluginManager; + + public FetcherManager() throws TikaConfigException { + pluginManager = new TikaPluginManager(); + } + + public FetcherManager(PluginManager pluginManager) { + this.pluginManager = pluginManager; + } + + public static FetcherManager load(PluginManager pluginManager) { + return new FetcherManager(pluginManager); + } + + public Fetcher getFetcher(String pluginId) { + return pluginManager.getExtensions(Fetcher.class, pluginId) + .stream() + .findFirst() + .orElseThrow(() -> new PipesRuntimeException("Could not find Fetcher extension for plugin " + pluginId)); + } + + public Set getSupported() { + return pluginManager.getExtensions(Fetcher.class) + .stream() + .map(Fetcher::getPluginId) + .collect(Collectors.toSet()); + } + + /** + * Convenience method that returns the first Fetcher extension + * found by the plugin manager, whichever plugin provides it. If no + * Fetcher extension is found, this throws a PipesRuntimeException. 
+ * @return + */ + public Fetcher getFetcher() { + return pluginManager.getExtensions(Fetcher.class) + .stream() + .findFirst() + .orElseThrow(() -> new PipesRuntimeException("Could not find any instances of the Fetcher extension")); + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfig.java new file mode 100644 index 0000000000..400b644005 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfig.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetcher.config; + +public abstract class FetcherConfig { + private String fetcherId; + + abstract public String getFetcherPluginId(); + + public void setFetcherPluginId(String fetcherPluginId) { + // no op - we put this here to appease the ConfigBase + } + + public String getFetcherId() { + return fetcherId; + } + + public FetcherConfig setFetcherId(String fetcherId) { + this.fetcherId = fetcherId; + return this; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfigContainer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfigContainer.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfigContainer.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/fetcher/config/FetcherConfigContainer.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java similarity index 98% rename from 
tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java index 34706f7e88..e142ffc2b0 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java @@ -41,6 +41,7 @@ import org.apache.tika.exception.TikaTimeoutException; import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.HandlerConfig; +import org.apache.tika.pipes.pipesiterator.fs.IPipesIterator; import org.apache.tika.sax.BasicContentHandlerFactory; /** @@ -51,7 +52,7 @@ * next() is called after hasNext() has returned false. */ public abstract class PipesIterator extends ConfigBase - implements Callable, Iterable, Initializable { + implements IPipesIterator, Callable, Iterable, Initializable { public static final long DEFAULT_MAX_WAIT_MS = 300_000; public static final int DEFAULT_QUEUE_SIZE = 1000; @@ -177,9 +178,7 @@ protected HandlerConfig getHandlerConfig() { return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached); } - - protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException; - + protected void tryToAdd(FetchEmitTuple p) throws InterruptedException, TimeoutException { added++; boolean offered = queue.offer(p, maxWaitMs, TimeUnit.MILLISECONDS); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCountResult.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCountResult.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCountResult.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCountResult.java diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java similarity index 100% rename from tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java similarity index 97% rename from tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java index 75cb8390cc..c28db99890 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java @@ -59,7 +59,7 @@ public class FileListPipesIterator extends PipesIterator implements Initializabl private Path fileListPath; @Override - protected void enqueue() throws IOException, TimeoutException, InterruptedException { + public void enqueue() throws IOException, TimeoutException, InterruptedException { try (BufferedReader reader = Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) { if (hasHeader) { reader.readLine(); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java rename to 
tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java index 967df73b99..34a61b428a 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java @@ -72,7 +72,7 @@ public void setBasePath(String basePath) { } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { if (!Files.isDirectory(basePath)) { throw new IllegalArgumentException( "\"basePath\" directory does not exist: " + basePath.toAbsolutePath()); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/IPipesIterator.java similarity index 73% rename from tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/IPipesIterator.java index a1c7e48734..acd34f23c4 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/config/AbstractConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/IPipesIterator.java @@ -14,8 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes.fetcher.config; +package org.apache.tika.pipes.pipesiterator.fs; -public abstract class AbstractConfig { - // Nothing to do here yet. 
+import java.io.IOException; +import java.util.concurrent.TimeoutException; + +import org.pf4j.ExtensionPoint; + +public interface IPipesIterator extends ExtensionPoint { + void enqueue() throws IOException, TimeoutException, InterruptedException; } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java new file mode 100644 index 0000000000..2d1a0a3fd6 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/plugin/ClasspathPluginPropertiesFinder.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.plugin; + +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.pf4j.PropertiesPluginDescriptorFinder; + +public class ClasspathPluginPropertiesFinder extends PropertiesPluginDescriptorFinder { + @Override + protected Path getPropertiesPath(Path pluginPath, String propertiesFileName) { + Path propertiesPath = super.getPropertiesPath(pluginPath, propertiesFileName); + if (!propertiesPath.toFile().exists()) { + // If in development mode, we can also pull the plugin.properties from $pluginDir/src/main/resources/plugin.properties + propertiesPath = Paths.get(propertiesPath.getParent().toAbsolutePath().toString(), "src", "main", "resources", "plugin.properties"); + } + return propertiesPath; + } +} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/plugin/TikaPluginManager.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/plugin/TikaPluginManager.java new file mode 100644 index 0000000000..b71bae1dd0 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/plugin/TikaPluginManager.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.plugin; + +import java.nio.file.Path; +import java.util.List; + +import org.pf4j.DefaultPluginManager; +import org.pf4j.PluginDescriptorFinder; +import org.pf4j.PluginLoader; +import org.pf4j.PluginWrapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.pipes.exception.PipesRuntimeException; +import org.apache.tika.pipes.fetcher.Fetcher; + +public class TikaPluginManager extends DefaultPluginManager { + private static final Logger LOGGER = LoggerFactory.getLogger(TikaPluginManager.class); + public TikaPluginManager() { + } + + public TikaPluginManager(Path... pluginsRoots) { + super(pluginsRoots); + } + + public TikaPluginManager(List pluginsRoots) { + super(pluginsRoots); + } + + @Override + protected PluginDescriptorFinder createPluginDescriptorFinder() { + return new ClasspathPluginPropertiesFinder(); + } + + @Override + protected PluginLoader createPluginLoader() { + return super.createPluginLoader(); + } + + @Override + public void loadPlugins() { + super.loadPlugins(); + LOGGER.info("Loaded {} plugins", getPlugins().size()); + } + + @Override + public void startPlugins() { + super.startPlugins(); + for (PluginWrapper plugin : getStartedPlugins()) { + LOGGER.info("Add-in " + plugin.getPluginId() + " : " + plugin.getDescriptor() + " has started."); + checkFetcherExtensions(plugin); + } + } + + private void checkFetcherExtensions(PluginWrapper plugin) { + for (Class extensionClass : getExtensionClasses(Fetcher.class, plugin.getPluginId())) { + if (!Fetcher.class.isAssignableFrom(extensionClass)) { + throw new PipesRuntimeException("Something is wrong with the classpath. " + Fetcher.class.getName() + + " should be assignable from " + extensionClass.getName() + + ". 
Did tika-core accidentally get in your plugin lib?"); + } + LOGGER.info(" Extension " + extensionClass + " has been registered to plugin " + plugin.getPluginId()); + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/PipesReporter.java similarity index 95% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/PipesReporter.java index 3978039b40..1a9c165d03 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/PipesReporter.java @@ -14,11 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes; +package org.apache.tika.pipes.reporter; import java.io.Closeable; import java.io.IOException; +import org.apache.tika.pipes.FetchEmitTuple; +import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.pipesiterator.TotalCountResult; /** diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/PipesReporterBase.java similarity index 98% rename from tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/PipesReporterBase.java index 3dcddfa71e..6cdf89767f 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/PipesReporterBase.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.pipes; +package org.apache.tika.pipes.reporter; import java.util.HashSet; import java.util.List; @@ -26,6 +26,7 @@ import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.pipes.PipesResult; /** * Base class that includes filtering by {@link PipesResult.STATUS} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/logging/LoggingPipesReporter.java similarity index 88% rename from tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java rename to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/logging/LoggingPipesReporter.java index 5f00880ba0..fe61d86ddc 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/reporter/logging/LoggingPipesReporter.java @@ -14,12 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.pipes; +package org.apache.tika.pipes.reporter.logging; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.pipes.FetchEmitTuple; +import org.apache.tika.pipes.PipesResult; +import org.apache.tika.pipes.reporter.PipesReporter; + /** * Simple PipesReporter that logs everything at the debug level. 
*/ diff --git a/tika-core/src/main/resources/pipes-fork-server-default-log4j2.xml b/tika-pipes/tika-pipes-core/src/main/resources/pipes-fork-server-default-log4j2.xml similarity index 100% rename from tika-core/src/main/resources/pipes-fork-server-default-log4j2.xml rename to tika-pipes/tika-pipes-core/src/main/resources/pipes-fork-server-default-log4j2.xml diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java similarity index 80% rename from tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java index ff80bb9160..6fd2d39c89 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java @@ -29,18 +29,22 @@ import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.pf4j.PluginManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.TikaTest; import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.emitter.EmitKey; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.fetcher.FetchKey; -import org.apache.tika.pipes.fetcher.Fetcher; 
import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.plugin.TikaPluginManager; public class PipesServerTest extends TikaTest { + private static final Logger LOG = LoggerFactory.getLogger(PipesServerTest.class); /** * This test is useful for stepping through the debugger on PipesServer @@ -64,16 +68,24 @@ public void testBasic(@TempDir Path tmp) throws Exception { UnsynchronizedByteArrayInputStream.builder().setByteArray(new byte[0]).get(), new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true, StandardCharsets.UTF_8.name()), - -1, 30000, 30000); + -1, 30000, 30000, null); pipesServer.initializeResources(); FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", new FetchKey("fs", "mock.xml"), new EmitKey("", "")); - Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. + Path fetchersPath = Path.of("..", "tika-fetchers"); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); + PipesServer.MetadataListAndEmbeddedBytes - parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcherManager.getFetcher("file-system-fetcher")); assertEquals("5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd", parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256")); } @@ -99,7 +111,7 @@ public void testEmbeddedStreamEmitter(@TempDir Path tmp) throws Exception { UnsynchronizedByteArrayInputStream.builder().setByteArray(new byte[0]).get(), new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true, StandardCharsets.UTF_8.name()), - -1, 30000, 30000); + -1, 30000, 30000, 
null); pipesServer.initializeResources(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = @@ -111,9 +123,16 @@ public void testEmbeddedStreamEmitter(@TempDir Path tmp) throws Exception { FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", new FetchKey("fs", "mock.xml"), new EmitKey("", ""), new Metadata(), parseContext); - Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. + Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); + PipesServer.MetadataListAndEmbeddedBytes - parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcherManager.getFetcher()); assertEquals(2, parseData.metadataList.size()); byte[] bytes0 = @@ -155,7 +174,7 @@ public void testEmbeddedStreamEmitterLimitBytes(@TempDir Path tmp) throws Except UnsynchronizedByteArrayInputStream.builder().setByteArray(new byte[0]).get(), new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true, StandardCharsets.UTF_8.name()), - -1, 30000, 30000); + -1, 30000, 30000, null); pipesServer.initializeResources(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = @@ -168,9 +187,16 @@ public void testEmbeddedStreamEmitterLimitBytes(@TempDir Path tmp) throws Except new FetchKey("fs", "mock.xml"), new EmitKey("", ""), new Metadata(), parseContext); - Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. 
+ Path fetchersPath = Path.of("..", ".."); + LOG.info("Using pf4j in development mode using plugins dir: {}", fetchersPath.toFile().getCanonicalPath()); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); + PipesServer.MetadataListAndEmbeddedBytes - parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcherManager.getFetcher()); assertEquals(2, parseData.metadataList.size()); byte[] bytes0 = diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockDigesterFactory.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockDigesterFactory.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/async/MockDigesterFactory.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockDigesterFactory.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java similarity index 96% rename from 
tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java index acb533ece4..ef825e81e0 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java @@ -27,14 +27,13 @@ import org.apache.tika.pipes.fetcher.Fetcher; public class MockFetcher implements Fetcher { - private static final byte[] BYTES = ("" + "" + "Nikolai Lobachevsky" + "main_content" + "").getBytes(StandardCharsets.UTF_8); @Override - public String getName() { - return "mock"; + public String getPluginId() { + return "mock-fetcher"; } @Override diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java similarity index 97% rename from tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java index 6e8308c895..2a59859785 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java @@ -20,8 +20,8 @@ import org.apache.tika.config.Field; import org.apache.tika.pipes.FetchEmitTuple; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.PipesResult; +import org.apache.tika.pipes.reporter.PipesReporter; public class MockReporter extends PipesReporter { diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java similarity index 97% rename from tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java index 
9bfcd55918..99dd94edb9 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java @@ -26,7 +26,7 @@ import org.junit.jupiter.api.Test; import org.apache.tika.pipes.CompositePipesReporter; -import org.apache.tika.pipes.PipesReporter; +import org.apache.tika.pipes.reporter.PipesReporter; public class MockReporterTest { diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/config/TikaPipesConfigTest.java similarity index 69% rename from tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/config/TikaPipesConfigTest.java index 3ea1e538ce..392dbc611f 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java +++ b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/config/TikaPipesConfigTest.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.tika.config; +package org.apache.tika.pipes.config; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -22,61 +22,22 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.List; import org.junit.jupiter.api.Test; +import org.apache.tika.config.AbstractTikaConfigTest; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.CompositePipesReporter; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.async.AsyncConfig; import org.apache.tika.pipes.async.MockReporter; import org.apache.tika.pipes.emitter.Emitter; import org.apache.tika.pipes.emitter.EmitterManager; -import org.apache.tika.pipes.fetcher.Fetcher; -import org.apache.tika.pipes.fetcher.FetcherManager; -import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher; import org.apache.tika.pipes.pipesiterator.PipesIterator; +import org.apache.tika.pipes.reporter.PipesReporter; public class TikaPipesConfigTest extends AbstractTikaConfigTest { //this handles tests for the newer pipes type configs. 
- - @Test - public void testFetchers() throws Exception { - FetcherManager m = FetcherManager.load(getConfigFilePath("fetchers-config.xml")); - Fetcher f1 = m.getFetcher("fs1"); - assertEquals(Paths.get("/my/base/path1"), ((FileSystemFetcher) f1).getBasePath()); - - Fetcher f2 = m.getFetcher("fs2"); - assertEquals(Paths.get("/my/base/path2"), ((FileSystemFetcher) f2).getBasePath()); - } - - @Test - public void testDuplicateFetchers() throws Exception { - //can't have two fetchers with the same name - assertThrows(TikaConfigException.class, () -> { - FetcherManager.load(getConfigFilePath("fetchers-duplicate-config.xml")); - }); - } - - @Test - public void testNoNameFetchers() throws Exception { - //can't have two fetchers with an empty name - assertThrows(TikaConfigException.class, () -> { - FetcherManager.load(getConfigFilePath("fetchers-noname-config.xml")); - }); - } - - @Test - public void testNoBasePathFetchers() throws Exception { - //no basepath is allowed as of > 2.3.0 - //test that this does not throw an exception. 
- - FetcherManager fetcherManager = FetcherManager.load( - getConfigFilePath("fetchers-nobasepath-config.xml")); - } - @Test public void testEmitters() throws Exception { EmitterManager emitterManager = diff --git a/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java b/tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java similarity index 100% rename from tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java rename to tika-pipes/tika-pipes-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java diff --git a/tika-pipes/tika-pipes-core/src/test/resources/log4j2.xml 
b/tika-pipes/tika-pipes-core/src/test/resources/log4j2.xml new file mode 100644 index 0000000000..5f946e6e5c --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/test/resources/log4j2.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-3941.xml b/tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/TIKA-3941.xml similarity index 100% rename from tika-core/src/test/resources/org/apache/tika/pipes/TIKA-3941.xml rename to tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/TIKA-3941.xml diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml b/tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml similarity index 100% rename from tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml rename to tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml b/tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml similarity index 100% rename from tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml rename to tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3507.xml b/tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3507.xml similarity index 100% rename from tika-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3507.xml rename to tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3507.xml diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3865.xml b/tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3865.xml similarity index 100% rename from tika-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3865.xml 
rename to tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/async/TIKA-3865.xml diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml b/tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml similarity index 100% rename from tika-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml rename to tika-pipes/tika-pipes-core/src/test/resources/org/apache/tika/pipes/tika-sample-config.xml diff --git a/tika-pipes/tika-pipes-iterators/pom.xml b/tika-pipes/tika-pipes-iterators/pom.xml index 2106a1b045..44f46a27b9 100644 --- a/tika-pipes/tika-pipes-iterators/pom.xml +++ b/tika-pipes/tika-pipes-iterators/pom.xml @@ -44,6 +44,20 @@ tika-pipes-iterator-az-blob + + + org.apache.tika + tika-pipes-core + ${project.version} + + + org.pf4j + pf4j + + provided + + + 3.0.0-BETA2-rc1 diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-az-blob/src/main/java/org/apache/tika/pipes/pipesiterator/azblob/AZBlobPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-az-blob/src/main/java/org/apache/tika/pipes/pipesiterator/azblob/AZBlobPipesIterator.java index 0c5d6840dc..f0bb0373fe 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-az-blob/src/main/java/org/apache/tika/pipes/pipesiterator/azblob/AZBlobPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-az-blob/src/main/java/org/apache/tika/pipes/pipesiterator/azblob/AZBlobPipesIterator.java @@ -86,7 +86,7 @@ public void setPrefix(String prefix) { } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); long start = System.currentTimeMillis(); diff --git 
a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java index e9c0065700..77cd4da3e8 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-csv/src/main/java/org/apache/tika/pipes/pipesiterator/csv/CSVPipesIterator.java @@ -112,7 +112,7 @@ public void setCsvPath(Path csvPath) { } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); try (Reader reader = Files.newBufferedReader(csvPath, charset)) { diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java index 248d2461e1..4a15a44b25 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-gcs/src/main/java/org/apache/tika/pipes/pipesiterator/gcs/GCSPipesIterator.java @@ -91,7 +91,7 @@ public void checkInitialization(InitializableProblemHandler problemHandler) thro } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); long start = System.currentTimeMillis(); diff --git 
a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-jdbc/src/main/java/org/apache/tika/pipes/pipesiterator/jdbc/JDBCPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-jdbc/src/main/java/org/apache/tika/pipes/pipesiterator/jdbc/JDBCPipesIterator.java index 2c178e1475..cdc647b89e 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-jdbc/src/main/java/org/apache/tika/pipes/pipesiterator/jdbc/JDBCPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-jdbc/src/main/java/org/apache/tika/pipes/pipesiterator/jdbc/JDBCPipesIterator.java @@ -139,7 +139,7 @@ public void setQueryTimeoutSeconds(int seconds) { } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); FetchEmitKeyIndices fetchEmitKeyIndices = null; diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java index 6d3ceb6c28..3e190a091c 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java @@ -45,7 +45,7 @@ public class JsonPipesIterator extends PipesIterator implements Initializable { private Path jsonPath; @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { try (BufferedReader reader = Files.newBufferedReader(jsonPath, StandardCharsets.UTF_8)) { String line = 
reader.readLine(); while (line != null) { diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-kafka/src/main/java/org/apache/tika/pipes/pipesiterator/kafka/KafkaPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-kafka/src/main/java/org/apache/tika/pipes/pipesiterator/kafka/KafkaPipesIterator.java index 9fbebcfdaf..4589c97de0 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-kafka/src/main/java/org/apache/tika/pipes/pipesiterator/kafka/KafkaPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-kafka/src/main/java/org/apache/tika/pipes/pipesiterator/kafka/KafkaPipesIterator.java @@ -147,7 +147,7 @@ public void checkInitialization(InitializableProblemHandler problemHandler) thro } @Override - protected void enqueue() throws InterruptedException, TimeoutException { + public void enqueue() throws InterruptedException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); long start = System.currentTimeMillis(); diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java index 38fc1889cf..e11ff614de 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-s3/src/main/java/org/apache/tika/pipes/pipesiterator/s3/S3PipesIterator.java @@ -181,7 +181,7 @@ public void checkInitialization(InitializableProblemHandler problemHandler) thro } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); long start = 
System.currentTimeMillis(); diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java index 9ecead289b..a02a1d4e0d 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java +++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java @@ -170,7 +170,7 @@ public void setProxyPort(int proxyPort) { } @Override - protected void enqueue() throws InterruptedException, IOException, TimeoutException { + public void enqueue() throws InterruptedException, IOException, TimeoutException { String fetcherName = getFetcherName(); String emitterName = getEmitterName(); diff --git a/tika-pipes/tika-pipes-reporters/pom.xml b/tika-pipes/tika-pipes-reporters/pom.xml index 13ea50a4db..9d0e7d2e94 100644 --- a/tika-pipes/tika-pipes-reporters/pom.xml +++ b/tika-pipes/tika-pipes-reporters/pom.xml @@ -37,6 +37,14 @@ tika-pipes-reporter-jdbc + + + org.apache.tika + tika-pipes-core + ${project.version} + + + 3.0.0-BETA2-rc1 diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/main/java/org/apache/tika/pipes/reporters/fs/FileSystemStatusReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/main/java/org/apache/tika/pipes/reporters/fs/FileSystemStatusReporter.java index b48745a6c6..6b456182ce 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/main/java/org/apache/tika/pipes/reporters/fs/FileSystemStatusReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/main/java/org/apache/tika/pipes/reporters/fs/FileSystemStatusReporter.java @@ -40,10 +40,10 @@ import org.apache.tika.config.Param; import 
org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.FetchEmitTuple; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.async.AsyncStatus; import org.apache.tika.pipes.pipesiterator.TotalCountResult; +import org.apache.tika.pipes.reporter.PipesReporter; import org.apache.tika.utils.ExceptionUtils; /** diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/test/java/org/apache/tika/pipes/reporters/fs/TestFileSystemStatusReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/test/java/org/apache/tika/pipes/reporters/fs/TestFileSystemStatusReporter.java index 16296fa1cf..42684e5775 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/test/java/org/apache/tika/pipes/reporters/fs/TestFileSystemStatusReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-fs-status/src/test/java/org/apache/tika/pipes/reporters/fs/TestFileSystemStatusReporter.java @@ -40,11 +40,11 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.async.AsyncStatus; import org.apache.tika.pipes.pipesiterator.PipesIterator; import org.apache.tika.pipes.pipesiterator.TotalCountResult; +import org.apache.tika.pipes.reporter.PipesReporter; public class TestFileSystemStatusReporter { diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java index e31c0dc5a2..0c4f671db8 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java +++ 
b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/main/java/org/apache/tika/pipes/reporters/jdbc/JDBCPipesReporter.java @@ -43,8 +43,8 @@ import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.FetchEmitTuple; -import org.apache.tika.pipes.PipesReporterBase; import org.apache.tika.pipes.PipesResult; +import org.apache.tika.pipes.reporter.PipesReporterBase; import org.apache.tika.utils.StringUtils; /** diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/test/java/org/apache/tika/pipes/reporters/jdbc/TestJDBCPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/test/java/org/apache/tika/pipes/reporters/jdbc/TestJDBCPipesReporter.java index 01d903c5ef..54dab5cfe4 100644 --- a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/test/java/org/apache/tika/pipes/reporters/jdbc/TestJDBCPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-jdbc/src/test/java/org/apache/tika/pipes/reporters/jdbc/TestJDBCPipesReporter.java @@ -49,12 +49,12 @@ import org.junit.jupiter.api.io.TempDir; import org.apache.tika.pipes.FetchEmitTuple; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.async.AsyncConfig; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.pipesiterator.TotalCountResult; +import org.apache.tika.pipes.reporter.PipesReporter; public class TestJDBCPipesReporter { diff --git a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-opensearch/src/main/java/org/apache/tika/pipes/reporters/opensearch/OpenSearchPipesReporter.java b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-opensearch/src/main/java/org/apache/tika/pipes/reporters/opensearch/OpenSearchPipesReporter.java index 7dbe136218..fb2aa279cb 100644 --- 
a/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-opensearch/src/main/java/org/apache/tika/pipes/reporters/opensearch/OpenSearchPipesReporter.java +++ b/tika-pipes/tika-pipes-reporters/tika-pipes-reporter-opensearch/src/main/java/org/apache/tika/pipes/reporters/opensearch/OpenSearchPipesReporter.java @@ -37,8 +37,8 @@ import org.apache.tika.metadata.ExternalProcess; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.FetchEmitTuple; -import org.apache.tika.pipes.PipesReporter; import org.apache.tika.pipes.PipesResult; +import org.apache.tika.pipes.reporter.PipesReporter; import org.apache.tika.utils.StringUtils; /** diff --git a/tika-serialization/pom.xml b/tika-pipes/tika-serialization/pom.xml similarity index 93% rename from tika-serialization/pom.xml rename to tika-pipes/tika-serialization/pom.xml index bfc12cb12f..de6e9f51d2 100644 --- a/tika-serialization/pom.xml +++ b/tika-pipes/tika-serialization/pom.xml @@ -24,9 +24,9 @@ org.apache.tika - tika-parent + tika-pipes 3.0.0-SNAPSHOT - ../tika-parent/pom.xml + ../pom.xml tika-serialization @@ -47,6 +47,12 @@ ${project.version} provided + + ${project.groupId} + tika-pipes-core + ${project.version} + provided + com.fasterxml.jackson.core jackson-core diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java rename to 
tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/PrettyMetadataKeyComparator.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/PrettyMetadataKeyComparator.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/PrettyMetadataKeyComparator.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/PrettyMetadataKeyComparator.java diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonDeserializer.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonDeserializer.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonDeserializer.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonDeserializer.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/TikaJsonSerializer.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaSerializationException.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/TikaSerializationException.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/TikaSerializationException.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/TikaSerializationException.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonEmitData.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonEmitData.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonEmitData.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonEmitData.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTuple.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTuple.java similarity index 100% rename from 
tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTuple.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTuple.java diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleList.java b/tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleList.java similarity index 100% rename from tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleList.java rename to tika-pipes/tika-serialization/src/main/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleList.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java rename to 
tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TikaJsonSerializationTest.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/TikaJsonSerializationTest.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/TikaJsonSerializationTest.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/TikaJsonSerializationTest.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassA.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassA.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassA.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassA.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassB.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassB.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassB.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassB.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassC.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassC.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassC.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/mocks/ClassC.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleListTest.java 
b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleListTest.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleListTest.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleListTest.java diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleTest.java b/tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleTest.java similarity index 100% rename from tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleTest.java rename to tika-pipes/tika-serialization/src/test/java/org/apache/tika/serialization/pipes/JsonFetchEmitTupleTest.java diff --git a/tika-serialization/src/test/resources/config/tika-config-json.xml b/tika-pipes/tika-serialization/src/test/resources/config/tika-config-json.xml similarity index 100% rename from tika-serialization/src/test/resources/config/tika-config-json.xml rename to tika-pipes/tika-serialization/src/test/resources/config/tika-config-json.xml diff --git a/tika-server/tika-server-client/pom.xml b/tika-server/tika-server-client/pom.xml index 2bc9d5fb02..9963d8fa6b 100644 --- a/tika-server/tika-server-client/pom.xml +++ b/tika-server/tika-server-client/pom.xml @@ -32,6 +32,11 @@ tika-core ${project.version} + + org.apache.tika + tika-pipes-core + ${project.version} + ${project.groupId} tika-serialization @@ -132,4 +137,4 @@ 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index 7f604b55ec..63a3399f42 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -41,6 +41,11 @@ tika-core ${project.version} + + org.apache.tika + tika-pipes-core + ${project.version} + ${project.groupId} tika-translate @@ -130,6 +135,11 @@ 
org.apache.logging.log4j log4j-slf4j2-impl + + org.pf4j + pf4j + provided + @@ -335,4 +345,4 @@ 3.0.0-BETA2-rc1 - \ No newline at end of file + diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java index e7f1d210ef..77943bca17 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/FetcherStreamFactory.java @@ -108,7 +108,7 @@ public InputStream getInputStream(InputStream is, Metadata metadata, HttpHeaders Fetcher fetcher = fetcherManager.getFetcher(fetcherName); if (fetchRangeStart > -1 && fetchRangeEnd > -1 && !(fetcher instanceof RangeFetcher)) { throw new IllegalArgumentException( - "Can't call a fetch with a range on a fetcher that" + " is not a RangeFetcher: name=" + fetcher.getName() + " class=" + fetcher.getClass()); + "Can't call a fetch with a range on a fetcher that" + " is not a RangeFetcher: pluginId=" + fetcher.getPluginId() + " class=" + fetcher.getClass()); } return fetcher.fetch(fetchKey, metadata, parseContext); } catch (TikaException e) { diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java index 6c7cc97f76..4537a5ae29 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java @@ -206,7 +206,7 @@ private static ServerDetails initServer(TikaServerConfig tikaServerConfig) throw FetcherManager fetcherManager = null; InputStreamFactory inputStreamFactory = null; if (tikaServerConfig.isEnableUnsecureFeatures()) { - fetcherManager = 
FetcherManager.load(tikaServerConfig.getConfigPath()); + fetcherManager = new FetcherManager(); inputStreamFactory = new FetcherStreamFactory(fetcherManager); } else { inputStreamFactory = new DefaultInputStreamFactory(); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java index 79107476eb..84e485c510 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java @@ -40,6 +40,7 @@ import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; @@ -48,7 +49,6 @@ import org.apache.tika.pipes.async.OfferLargerThanQueueSize; import org.apache.tika.pipes.emitter.EmitData; import org.apache.tika.pipes.emitter.EmitterManager; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.serialization.pipes.JsonFetchEmitTupleList; diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceFetcherTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceFetcherTest.java index 47219afa4e..235ad21815 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceFetcherTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceFetcherTest.java @@ -34,10 +34,11 @@ import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.pf4j.PluginManager; 
-import org.apache.tika.exception.TikaConfigException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.pipes.fetcher.FetcherManager; +import org.apache.tika.pipes.plugin.TikaPluginManager; import org.apache.tika.server.core.resource.TikaResource; import org.apache.tika.server.core.writer.JSONMessageBodyWriter; @@ -81,10 +82,16 @@ protected InputStream getTikaConfigInputStream() throws IOException { @Override protected InputStreamFactory getInputStreamFactory(InputStream is) { + System.setProperty("pf4j.mode", "development"); // Development mode lets you work from source dir easier. + Path fetchersPath = Path.of("..", ".."); + PluginManager pluginManager = new TikaPluginManager(fetchersPath); + pluginManager.loadPlugins(); + pluginManager.startPlugins(); + FetcherManager fetcherManager = FetcherManager.load(pluginManager); + try (TikaInputStream tis = TikaInputStream.get(is)) { - FetcherManager fetcherManager = FetcherManager.load(tis.getPath()); return new FetcherStreamFactory(fetcherManager); - } catch (IOException | TikaConfigException e) { + } catch (IOException e) { throw new RuntimeException(e); } } diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index 36dc60a3c9..5ddca0492f 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -49,6 +49,7 @@ import org.junit.jupiter.api.io.TempDir; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; @@ -56,7 +57,6 @@ import org.apache.tika.pipes.FetchEmitTuple; import 
org.apache.tika.pipes.HandlerConfig; import org.apache.tika.pipes.emitter.EmitKey; -import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.fetcher.FetcherManager; import org.apache.tika.sax.BasicContentHandlerFactory;