diff --git a/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/CLI.java b/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/CLI.java
index 4f62744ee77..a4311f6d2ff 100644
--- a/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/CLI.java
+++ b/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/CLI.java
@@ -60,6 +60,11 @@ public class CLI {
public static void main(String... arguments) {
+
+ // There's no easy, better way :(
+ // Setting the usage-width to 100 chars so that URLs are not line-wrapped.
+ System.setProperty("picocli.usage.width", "100");
+
System.exit(runMain(arguments));
}
diff --git a/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/options/IcebergOptions.java b/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/options/IcebergOptions.java
index 1074ea2a475..908cc4bca89 100644
--- a/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/options/IcebergOptions.java
+++ b/gc/gc-tool/src/main/java/org/projectnessie/gc/tool/cli/options/IcebergOptions.java
@@ -24,14 +24,56 @@ public class IcebergOptions {
@CommandLine.Option(
names = {"-I", "--iceberg"},
split = ",",
- description = "Iceberg properties used to configure the FileIO.")
+ description = {
+ "Iceberg properties used to configure the FileIO.",
+ "The following properties are almost always required.",
+ "",
+ "For S3:",
+ "- s3.access-key-id",
+ "- s3.secret-access-key",
+ "- s3.endpoint, if you use an S3 compatible object store like MinIO",
+ "",
+ "For GCS:",
+ "- io-impl=org.apache.iceberg.gcp.gcs.GCSFileIO",
+ "- gcs.project-id",
+ "- gcs.oauth2.token",
+ "",
+ "For ADLS:",
+ "- io-impl=org.apache.iceberg.azure.adlsv2.ADLSFileIO",
+ "- adls.auth.shared-key.account.name",
+ "- adls.auth.shared-key.account.key",
+ })
Map<String, String> icebergProperties = new HashMap<>();
@CommandLine.Option(
names = {"-H", "--hadoop"},
split = ",",
- description =
- "Hadoop configuration option, required when using an Iceberg FileIO that is not S3.")
+ description = {
+ "Hadoop configuration option, required when using an Iceberg FileIO that is not S3FileIO.",
+ "The following configuration settings might be required.",
+ "",
+ "For S3:",
+ "- fs.s3.impl=org.apache.hadoop.fs.s3a.S3AFileSystem",
+ "- fs.s3a.access.key",
+ "- fs.s3a.secret.key",
+ "- fs.s3a.endpoint, if you use an S3 compatible object store like MinIO",
+ "",
+ "For GCS:",
+ "- fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
+ "- fs.AbstractFileSystem.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
+ "- fs.gs.project.id",
+ "- fs.gs.auth.type=USER_CREDENTIALS",
+ "- fs.gs.auth.client.id",
+ "- fs.gs.auth.client.secret",
+ "- fs.gs.auth.refresh.token",
+ "",
+ "For ADLS:",
+ "- fs.azure.impl=org.apache.hadoop.fs.azure.AzureNativeFileSystemStore",
+ "- fs.AbstractFileSystem.azure.impl=org.apache.hadoop.fs.azurebfs.Abfs",
+ "- fs.azure.storage.emulator.account.name",
+ "- fs.azure.account.auth.type=SharedKey",
+ "- fs.azure.account.key.=",
+ })
Map<String, String> hadoopConf = new HashMap<>();
public Map<String, String> getIcebergProperties() {
diff --git a/site/in-dev/gc.md b/site/in-dev/gc.md
index 6e3f635c5f3..ec5f6a92348 100644
--- a/site/in-dev/gc.md
+++ b/site/in-dev/gc.md
@@ -41,40 +41,10 @@ java -jar nessie-gc.jar --help
You should see the following output:
-```text
-Usage: nessie-gc.jar [-hV] [COMMAND]
- -h, --help Show this help message and exit.
- -V, --version Print version information and exit.
-Commands:
- help Display help information about the specified
- command.
- mark-live, identify, mark Run identify-live-content phase of Nessie GC, - must not be used with the in-memory - contents-storage. - sweep, expire Run expire-files + delete-orphan-files phase - of Nessie GC using a live-contents-set - stored by a previous run of the mark-live - command, must not be used with the in-memory - contents-storage. - gc Run identify-live-content and expire-files + - delete-orphan-files. - list List existing live-sets, must not be used with - the in-memory contents-storage. - delete Delete a live-set, must not be used with the - in-memory contents-storage. - list-deferred List files collected as deferred deletes, must - not be used with the in-memory - contents-storage. - deferred-deletes Delete files collected as deferred deletes, - must not be used with the in-memory - contents-storage. - show Show information of a live-content-set, must - not be used with the in-memory - contents-storage. - show-sql-create-schema-script Print DDL statements to create the schema. - create-sql-schema JDBC schema creation. - completion-script Extracts the command-line completion script. -``` +{% include './generated-docs/gc-help.md' %} + +!!! info + Help for all Nessie GC tool commands are [below on this page](#nessie-gc-tool-commands) The following example assumes that you have a Nessie server running at `http://localhost:19120` and a PostgreSQL instance running at `jdbc:postgresql://localhost:5432/nessie_gc` with user `pguser` and @@ -188,12 +158,62 @@ spec: EOF ``` -# Nessie GC for Nessie Administrators +## Nessie GC Tool commands + +{% include './generated-docs/gc-help.md' %} + +Below is the output of the Nessie GC tool help for all commands. + +### `mark-live`, `identify`, `mark` + +{% include './generated-docs/gc-help-mark.md' %} + +### `sweep`, `expire` + +{% include './generated-docs/gc-help-sweep.md' %} + +### `gc` + +{% include './generated-docs/gc-help-gc.md' %} + +### `list` + +{% include './generated-docs/gc-help-list.md' %} + +### `delete` + +{% include './generated-docs/gc-help-delete.md' %} + +### `list-deferred` + +{% include './generated-docs/gc-help-list-deferred.md' %} + +### `deferred-deletes` + +{% include './generated-docs/gc-help-deferred-deletes.md' %} + +### `show` + +{% include './generated-docs/gc-help-show.md' %} + +### `show-sql-create-schema-script` + +{% include './generated-docs/gc-help-show-sql-create-schema-script.md' %} + +### `create-sql-schema` + +{% include './generated-docs/gc-help-create-sql-schema.md' %} + +### `completion-script` + +{% include './generated-docs/gc-help-completion-script.md' %} + +## Nessie GC for Nessie Administrators Please refer to the [Garbage Collection](../guides/management.md#garbage-collection) documentation for information on how to run the Nessie GC on a regular basis in production. -# Nessie GC Internals +## Nessie GC Internals The rest of this document describes the internals of the Nessie GC tool and is intended for developers who want to understand how the tool works. @@ -217,7 +237,7 @@ Modules that supplement the `gc-base` module: The `gc-tool` module is a command-line interface, a standalone tool provided as an executable, it is an uber jar prefixed with a shell script, and can still be executed with `java -jar ...`. -## Basic Nessie-GC functionality +### Basic Nessie-GC functionality Nessie-GC implements a mark-and-sweep approach, a two-phase process: @@ -230,7 +250,7 @@ versions of a `Content` are scanned to identify the set of live data files. 
Afte base-location(s) are scanned and all files that are not in the set of live data files are deleted. The "sweep phase" is implemented by `DefaultLocalExpire`. -## Inner workings +### Inner workings To minimize the amount of data needed to match against the set of live data files for a `Content`, the implementation does not actually remember all individual data files, like maintaining a @@ -240,7 +260,7 @@ Both the "mark" (identify live contents) and "sweep" (identify and delete expire provide a configurable _parallelism_: the number of concurrently scanned named references can be configured and the amount of concurrently processed tables can be configured. -### _Mark_ phase optimization +#### _Mark_ phase optimization The implementation that walks the commit logs can be configured with a `VisitedDeduplicator`, which is meant to reduce the work required during the "mark" phase, if the commit to be examined has @@ -252,7 +272,7 @@ and/or has to walk many commits. This `DefaultVisitedDeduplicator` is present, b mentioned concerns _not_ available in the Nessie GC tool and the use of `DefaultVisitedDeduplicator` is not supported at all, and not recommended. -## Identified live contents repository +### Identified live contents repository It is recommended to use an external database for the Nessie GC repository. This is especially recommended for big Nessie repositories. @@ -261,7 +281,7 @@ Nessie GC runs against small-ish repositories do technically work with an in-mem But, as the term "in memory" suggests, the identified live-contents-set, its state, duration, etc. cannot be inspected afterwards. -## Pluggable code +### Pluggable code Different parts / functionalities are quite isolated and abstracted to allow proper unit-testability and also allow reuse of similar functionality. @@ -274,7 +294,7 @@ Examples of abstracted/isolated functionality: * Getting all data files for a specific content reference (think: Iceberg table snapshot) * Commit-log-scanning duplicate work elimination -## File references +### File references All files (or objects, in case of an object store like S3) are described using a `FileReference`, using a _base_ URI plus a URI _relative_ to the base URI. Noteworthy: the "sweep phase", which @@ -287,7 +307,7 @@ stores do not know about directories, further Iceberg's `FileIO` does not know a either. For file systems that do support directories this means, that empty directories will not be deleted, and prematurely deleting directories could break concurrent operations. -## Runtime requirements +### Runtime requirements Nessie GC work is dominated by network and/or disk I/O, less by CPU and heap pressure. @@ -304,7 +324,7 @@ Memory requirements (rough estimates): * An in-memory live-contents-repository (**not recommended for production workloads**) requires memory for all content-references. -### CPU & heap pressure testing +#### CPU & heap pressure testing Special "tests" ([this](https://github.com/projectnessie/nessie/blob/main/gc/gc-base/src/test/java/org/projectnessie/gc/huge/TestManyObjects.java) and ([this](https://github.com/projectnessie/nessie/blob/main/gc/gc-iceberg-inttest/src/test/java/org/projectnessie/gc/iceberg/inttest/ITHuge.java)) have @@ -315,12 +335,12 @@ implementation requires little memory and little CPU - runtime is largely domina _put_ and _maybe-contains_ operations for the per-content-expire runs. Both tests proved the concept. -## Deferred deletion +### Deferred deletion The default behavior is to immediately deletes orphan files. 
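To make the sweep behaviour described above more concrete, here is a small, self-contained Java sketch of the idea. It is illustrative only — the names (`SweepSketch`, `FileDeleter`) are made up and are not the actual `gc-base` API — and it assumes Guava on the classpath for the Bloom filter whose _put_ and _maybe-contains_ operations are mentioned above. It shows the per-content match of swept files against the recorded live files, plus the two deletion modes: deleting orphan files immediately (the default) or merely recording them for a later deferred-deletes run.

```java
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class SweepSketch {

  /** Illustrative stand-in for the real deleter abstraction used by the sweep phase. */
  interface FileDeleter {
    void delete(URI file);
  }

  public static void main(String[] args) {
    // Base location of one content (e.g. one Iceberg table), as seen by the mark phase.
    URI base = URI.create("s3://bucket/warehouse/db/table_1/");

    // Mark phase result for this content: live files tracked by their *relative* URI in a
    // Bloom filter, so only bounded memory and put/maybe-contains operations are needed.
    BloomFilter<CharSequence> liveFiles =
        BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 1_000_000, 0.0001d);
    liveFiles.put("data/00000-1-aaaa.parquet");

    // Sweep phase: everything actually found when listing the base location.
    List<URI> filesInBaseLocation =
        List.of(
            URI.create("s3://bucket/warehouse/db/table_1/data/00000-1-aaaa.parquet"),
            URI.create("s3://bucket/warehouse/db/table_1/data/00099-9-zzzz.parquet"));

    // Delete orphan files immediately (the default behavior) ...
    FileDeleter immediateDelete = file -> System.out.println("deleting " + file);
    // ... or only record them, so a later deferred-deletes run can remove them.
    List<URI> deferredDeletes = new ArrayList<>();
    FileDeleter recordForLater = deferredDeletes::add;

    FileDeleter deleter = recordForLater; // use immediateDelete for the default behavior

    for (URI file : filesInBaseLocation) {
      String relative = base.relativize(file).toString();
      // A false positive from the Bloom filter only means an orphan file survives this run;
      // files that really are live are never deleted.
      if (!liveFiles.mightContain(relative)) {
        deleter.delete(file);
      }
    }
    System.out.println("recorded for deferred deletion: " + deferredDeletes);
  }
}
```

Note that a Bloom-filter false positive can only cause an orphan file to be kept — a live file is never deleted — which is the safe direction for a garbage collector to err in.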
But it is also possible to record the files to be deleted and delete those later. The `nessie-gc.jar` tool supports deferred deletion. -## Non-Nessie use cases +### Non-Nessie use cases Although all the above is designed for Nessie, it is possible to reuse the core implementation with "plain" Iceberg, effectively a complete replacement of Iceberg's _expire snapshots_ and _delete @@ -333,7 +353,7 @@ orphan files_, but without Iceberg's implicit requirement of using Spark. Things * Existing functionality, the mark-and-sweep logic and the code in `nessie-gc-iceberg` and `nessie-gc-iceberg-files`, can be reused without any changes. -## Potential future enhancements +### Potential future enhancements Since Nessie GC keeps track of all ever live content-references and all ever known base content locations, it is possible to identify ... @@ -343,7 +363,7 @@ locations, it is possible to identify ... * ... the content references (aka Iceberg snapshots) are no longer used. This information can be used to no longer expose the affected e.g. Iceberg snapshots in any table metadata. -### Completely unreferenced contents +#### Completely unreferenced contents Files of contents that are not visible from any live Nessie commit can be completely removed. Detecting this situation is not _directly_ supported by the above approach. @@ -357,7 +377,7 @@ the data files, manifests, etc were stored. The above must not purge files for content IDs that have just been recently created. -## Potential Iceberg specific enhancements +### Potential Iceberg specific enhancements Nessie GC can easily identify the Iceberg snapshots, as each Nessie commit references exactly one Iceberg table snapshot. Nessie (the runtime/server) has no knowledge of whether a particular diff --git a/tools/doc-generator/site-gen/build.gradle.kts b/tools/doc-generator/site-gen/build.gradle.kts index 8c9689cdb3d..2dc1289e7a8 100644 --- a/tools/doc-generator/site-gen/build.gradle.kts +++ b/tools/doc-generator/site-gen/build.gradle.kts @@ -15,6 +15,7 @@ */ import java.io.ByteArrayOutputStream +import java.io.InputStream plugins { `java-library` @@ -26,6 +27,7 @@ val genProjects by configurations.creating val genSources by configurations.creating val cliGrammar by configurations.creating val doclet by configurations.creating +val gcRunner by configurations.creating val cliRunner by configurations.creating val genProjectPaths = listOf( @@ -61,6 +63,8 @@ dependencies { } cliRunner(project(":nessie-cli")) + + gcRunner(nessieProject("nessie-gc-tool")) } val generatedMarkdownDocsDir = layout.buildDirectory.dir("generatedMarkdownDocs") @@ -150,9 +154,64 @@ val cliHelp = tasks.register("cliHelp") { } } +val gcHelpDir = layout.buildDirectory.dir("gcHelp") + +val gcHelp = tasks.register("gcHelp") { + mainClass = "-jar" + + inputs.files(gcRunner) + outputs.dir(gcHelpDir) + + classpath(gcRunner) + + val gcMainClass = "org.projectnessie.gc.tool.cli.CLI" + + mainClass = gcMainClass + args("--help") + + doFirst { + delete(gcHelpDir) + } + + standardInput = InputStream.nullInputStream() + standardOutput = ByteArrayOutputStream() + + doLast { + gcHelpDir.get().asFile.mkdirs() + + file(gcHelpDir.get().file("gc-help.md")).writeText("```\n$standardOutput\n```\n") + + for (cmd in listOf( + "mark", + "sweep", + "gc", + "list", + "delete", + "list-deferred", + "deferred-deletes", + "show", + "show-sql-create-schema-script", + "create-sql-schema", + "completion-script" + )) { + logger.info("Generating GC command help for '$cmd' ...") + val capture = ByteArrayOutputStream() 
+ javaexec { + mainClass = gcMainClass + classpath(gcRunner) + standardInput = InputStream.nullInputStream() + standardOutput = capture + args("help", cmd) + } + file(gcHelpDir.get().file("gc-help-$cmd.md")).writeText("```\n$capture\n```\n") + } + } +} + tasks.register("generateDocs") { dependsOn(generatedMarkdownDocs) dependsOn(cliHelp) + dependsOn(gcHelp) val targetDir = layout.buildDirectory.dir("markdown-docs") @@ -167,6 +226,7 @@ tasks.register("generateDocs") { from(generatedMarkdownDocsDir) from(cliHelpDir) + from(gcHelpDir) from(provider { zipTree(cliGrammar.singleFile) }) { include("org/projectnessie/nessie/cli/syntax/*.md") eachFile {
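Stepping back from the build-script wiring: the expanded `--iceberg` / `--hadoop` help added in `IcebergOptions.java` above describes plain key/value pairs that picocli splits (per the `split = ","` declaration) into `Map<String, String>` fields. The Java sketch below shows roughly what such a pair of maps amounts to for an S3-compatible object store — the endpoint and credentials are placeholders, `hadoop-common` is assumed on the classpath for `Configuration`, and the way `nessie-gc-tool` feeds these values to the Iceberg `FileIO` internally may differ.

```java
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;

public class GcFileIoConfigSketch {

  public static void main(String[] args) {
    // Equivalent of: -I s3.access-key-id=...,s3.secret-access-key=...,s3.endpoint=...
    // (comma-separated key=value pairs, collected into a Map<String, String> by picocli)
    Map<String, String> icebergProperties = new HashMap<>();
    icebergProperties.put("s3.access-key-id", "minio-root-user");         // placeholder
    icebergProperties.put("s3.secret-access-key", "minio-root-password"); // placeholder
    icebergProperties.put("s3.endpoint", "http://localhost:9000");        // e.g. MinIO

    // Equivalent of: -H fs.s3a.access.key=...,fs.s3a.secret.key=...,fs.s3a.endpoint=...
    // Per the option help, only required when the FileIO in use is not S3FileIO.
    Configuration hadoopConf = new Configuration();
    hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
    hadoopConf.set("fs.s3a.access.key", "minio-root-user");
    hadoopConf.set("fs.s3a.secret.key", "minio-root-password");
    hadoopConf.set("fs.s3a.endpoint", "http://localhost:9000");

    System.out.println("Iceberg FileIO properties: " + icebergProperties);
    System.out.println("fs.s3a.endpoint = " + hadoopConf.get("fs.s3a.endpoint"));
  }
}
```

On the command line, the same settings would be passed as `--iceberg s3.access-key-id=...,s3.secret-access-key=...,s3.endpoint=...` and, when a Hadoop-based `FileIO` is used, `--hadoop fs.s3a.access.key=...,fs.s3a.secret.key=...,fs.s3a.endpoint=...`.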