From 8568e091bdb0ec4a82a13e33e98c84c88b6c9f6c Mon Sep 17 00:00:00 2001
From: Christopher Bradford
Date: Thu, 26 Sep 2019 02:32:40 -0400
Subject: [PATCH] Added initial documentation for the Kafka CDC connector

---
 README.md                                 |    6 +-
 kafka-connector-cdc/.gitignore            |    1 +
 kafka-connector-cdc/README.md             |   89 ++
 kafka-connector-cdc/cassandra.yaml        | 1405 +++++++++++++++++++++
 kafka-connector-cdc/connector-config.json |   10 +
 kafka-connector-cdc/docker-compose.yml    |  169 +++
 kafka-connector-cdc/dse.yaml              | 1118 ++++++++++++++++
 7 files changed, 2797 insertions(+), 1 deletion(-)
 create mode 100644 kafka-connector-cdc/.gitignore
 create mode 100644 kafka-connector-cdc/README.md
 create mode 100644 kafka-connector-cdc/cassandra.yaml
 create mode 100644 kafka-connector-cdc/connector-config.json
 create mode 100644 kafka-connector-cdc/docker-compose.yml
 create mode 100644 kafka-connector-cdc/dse.yaml

diff --git a/README.md b/README.md
index cf65776..671590e 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,11 @@ Refer to the [dse-k8s-operator](dse-k8s-operator) directory.
 
 ### DataStax Java Driver Spring Boot Starter Labs
 
-Refer to the [spring-boot-starter](./spring-boot-starter) directory.
+Refer to the [spring-boot-starter](spring-boot-starter) directory.
+
+### DataStax Kafka Connector CDC Source
+
+Refer to the [kafka-connector-cdc](kafka-connector-cdc) directory.
 
 ## Support
diff --git a/kafka-connector-cdc/.gitignore b/kafka-connector-cdc/.gitignore
new file mode 100644
index 0000000..ba0d4d9
--- /dev/null
+++ b/kafka-connector-cdc/.gitignore
@@ -0,0 +1 @@
+kafka-connect-dse-2.0.0-20190925-LABS.jar
diff --git a/kafka-connector-cdc/README.md b/kafka-connector-cdc/README.md
new file mode 100644
index 0000000..7c9169f
--- /dev/null
+++ b/kafka-connector-cdc/README.md
@@ -0,0 +1,89 @@
+# DataStax Kafka Connector Source Demo
+
+## Install Dependencies
+
+1. Install Docker
+2. Install Docker Compose
+3. Download kafka-connect-dse-2.0.0-20190925-LABS.jar from DataStax Labs
+
+## Start the components
+
+4. Start up the stack: `docker-compose up -d`
+
+## Configure DSE
+
+5. Configure the DSE schema
+
+    ```
+    docker-compose exec dse cqlsh
+    ```
+
+    ```
+    CREATE KEYSPACE demo_ks WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1};
+    CREATE TABLE demo_ks.demo_table (
+        k TEXT,
+        v TEXT,
+        PRIMARY KEY ((k))
+    );
+    ```
+
+6. Enable the DSE Advanced Replication Destination for Kafka
+
+    ```
+    docker-compose exec dse dse advrep destination create --name demo_destination --transmission-enabled true
+    docker-compose exec dse dse advrep destination list
+    ```
+
+7. Enable the DSE Advanced Replication Channel for `demo_ks.demo_table`
+
+    ```
+    docker-compose exec dse dse advrep channel create --data-center-id dc1 --source-keyspace demo_ks --source-table demo_table --destination demo_destination --transmission-enabled true --collection-enabled true
+    docker-compose exec dse dse advrep channel status
+    ```
+
+## Configure Kafka and the Connector
+
+8. Connect to the Confluent Control Center at http://localhost:9021/
+9. Select the only cluster
+10. Click "Topics" in the left sidebar
+11. Click "Add a topic" in the top right corner
+12. Enter the following parameters, then click "Create with defaults"
+
+    Topic name: demo-topic
+    Number of partitions: 1
+
+13. Open "Connect" in the left sidebar
+14. Click "connect-default"
+15. Click "Add Connector"
+16. Click "Connect" under "DseSourceConnector"
+17. Enter the following parameters and click "Continue" (an equivalent REST API call is shown at the end of this section)
+
+    Name: demo-connector
+    Tasks max: 1
+    topic: demo-topic
+    destination: demo_destination
+    contact_points: dse
+18. Verify the configuration parameters and click "Launch"
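+
+As an alternative to the Control Center wizard, the connector can also be registered through the
+Kafka Connect REST API. The request below is only a sketch assembled from the values above; the
+exact property names and the fully qualified connector class accepted by this Labs build may
+differ (see `connector-config.json` in this directory and the DataStax Labs documentation), and
+it assumes the Connect worker's REST port (8083) is reachable from the host.
+
+```
+curl -X POST -H "Content-Type: application/json" http://localhost:8083/connectors -d '{
+  "name": "demo-connector",
+  "config": {
+    "connector.class": "DseSourceConnector",
+    "tasks.max": "1",
+    "topic": "demo-topic",
+    "destination": "demo_destination",
+    "contact_points": "dse"
+  }
+}'
+```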
+
+## Insert data to be replicated
+
+19. Start `cqlsh` and insert data
+
+    ```
+    docker-compose exec dse cqlsh
+    ```
+
+    ```
+    INSERT INTO demo_ks.demo_table (k, v) VALUES ('a', 'b');
+    INSERT INTO demo_ks.demo_table (k, v) VALUES ('c', 'd');
+    INSERT INTO demo_ks.demo_table (k, v) VALUES ('e', 'f');
+    ```
+
+## Validate behavior
+
+1. Check the number of messages waiting to be replicated
+
+    ```
+    docker-compose exec dse dse advrep replog count --source-keyspace demo_ks --source-table demo_table --destination demo_destination
+    ```
+
+2. Navigate to the topic view in the Confluent Control Center
+3. Click "Topics" in the left sidebar
+4. Select "demo-topic"
+5. Validate that messages are being produced and consumed
+6. Optionally, inspect the messages
diff --git a/kafka-connector-cdc/cassandra.yaml b/kafka-connector-cdc/cassandra.yaml
new file mode 100644
index 0000000..f184c96
--- /dev/null
+++ b/kafka-connector-cdc/cassandra.yaml
@@ -0,0 +1,1405 @@
+# cassandra.yaml is the main storage configuration file for DataStax Enterprise (DSE).
+
+# NOTE:
+# See the DataStax Enterprise documentation at https://docs.datastax.com/
+# /NOTE
+
+# The name of the cluster. This is mainly used to prevent machines in
+# one logical cluster from joining another.
+cluster_name: 'Test Cluster'
+
+# The number of tokens randomly assigned to this node on the ring.
+# The higher the token count is relative to other nodes, the larger the proportion of data
+# that this node will store. You probably want all nodes to have the same number
+# of tokens assuming they have equal hardware capability.
+#
+# If not set, the default value is 1 token for backward compatibility
+# and will use the initial_token as described below.
+#
+# Specifying initial_token will override this setting on the node's initial start.
+# On subsequent starts, this setting will apply even if initial token is set.
+#
+# If you already have a cluster with 1 token per node, and want to migrate to
+# multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
+# num_tokens: 128
+
+# Triggers automatic allocation of num_tokens tokens for this node. The allocation
+# algorithm attempts to choose tokens in a way that optimizes replicated load over
+# the nodes in the datacenter for the specified DC-level replication factor.
+#
+# The load assigned to each node will be close to proportional to its number of
+# vnodes.
+#
+# Supported only with the Murmur3Partitioner.
+# allocate_tokens_for_local_replication_factor: 3
+
+# initial_token allows you to specify tokens manually. To use with
+# vnodes (num_tokens > 1, above), provide a
+# comma-separated list of tokens. This option is primarily used when adding nodes to legacy clusters
+# that do not have vnodes enabled.
+# initial_token:
+
+# See http://wiki.apache.org/cassandra/HintedHandoff
+# True to enable globally, false to disable globally.
+hinted_handoff_enabled: true
+
+# When hinted_handoff_enabled is true, a black list of data centers that will not
+# perform hinted handoff. Other datacenters not listed will perform hinted handoffs.
+# hinted_handoff_disabled_datacenters:
+#    - DC1
+#    - DC2
+
+# Maximum amount of time during which the database generates hints for an unresponsive node.
+# After this interval, the database does not generate any new hints for the node until it is +# back up and responsive. If the node goes down again, the database starts a new interval. This setting +# can prevent a sudden demand for resources when a node is brought back online and the rest of the +# cluster attempts to replay a large volume of hinted writes. +max_hint_window_in_ms: 10800000 # 3 hours + +# Maximum throttle in KBs per second per delivery thread. This will be +# reduced proportionally to the number of nodes in the cluster. If there +# are two nodes in the cluster, each delivery thread will use the maximum +# rate; if there are three, each will throttle to half of the maximum, +# since we expect two nodes to be delivering hints simultaneously. +hinted_handoff_throttle_in_kb: 1024 + +# Number of threads with which to deliver hints; +# Consider increasing this number when you have multi-dc deployments, since +# cross-dc handoff tends to be slower +max_hints_delivery_threads: 2 + +# Directory to store hints. +# If not set, the default directory is $DSE_HOME/data/hints. +hints_directory: /var/lib/cassandra/hints + +# How often to flush hints from the internal buffers to disk. +# Will *not* trigger fsync. +hints_flush_period_in_ms: 10000 + +# Maximum size, in MB, for a single hints file. +max_hints_file_size_in_mb: 128 + +# Compression to apply to the hint files. If omitted, hints files +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +#hints_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Maximum throttle in KBs per second, total. This will be +# reduced proportionally to the number of nodes in the cluster. +batchlog_replay_throttle_in_kb: 1024 + +# Strategy to choose the batchlog storage endpoints. +# +# Available options: +# +# - random_remote +# Default, purely random. Prevents the local rack, if possible. Same behavior as earlier releases. +# +# - dynamic_remote +# Uses DynamicEndpointSnitch to select batchlog storage endpoints. Prevents the +# local rack, if possible. This strategy offers the same availability guarantees +# as random_remote, but selects the fastest endpoints according to the DynamicEndpointSnitch. +# (DynamicEndpointSnitch tracks reads but not writes. Write-only, +# or mostly-write, workloads might not benefit from this strategy. +# Note: this strategy will fall back to random_remote if dynamic_snitch is not enabled. +# +# - dynamic +# Mostly the same as dynamic_remote, except that local rack is not excluded, which offers lower +# availability guarantee than random_remote or dynamic_remote. +# Note: this strategy will fall back to random_remote if dynamic_snitch is not enabled. +# +# batchlog_endpoint_strategy: random_remote + +# DataStax Enterprise (DSE) provides the DseAuthenticator for external authentication +# with multiple authentication schemes such as Kerberos, LDAP, and internal authentication. +# Additional configuration is required in dse.yaml for enabling authentication. +# If using DseAuthenticator, DseRoleManager must also be used (see below). +# +# All other authenticators, including org.apache.cassandra.auth.{AllowAllAuthenticator, +# PasswordAuthenticator} are deprecated, and some security features may not work +# correctly if they are used. +authenticator: com.datastax.bdp.cassandra.auth.DseAuthenticator + +# DataStax Enterprise (DSE) provides the DseAuthorizer which must be used in place +# of the CassandraAuthorizer if the DseAuthenticator is being used. 
It allows +# enhanced permission management of DSE specific resources. +# Additional configuration is required in dse.yaml for enabling authorization. +# +# All other authorizers, including org.apache.cassandra.auth.{AllowAllAuthorizer, +# CassandraAuthorizer} are deprecated, and some security features may not work +# correctly if they are used. +authorizer: com.datastax.bdp.cassandra.auth.DseAuthorizer + +# DataStax Enterprise (DSE) provides the DseRoleManager that supports LDAP roles +# as well as the internal roles supported by CassandraRoleManager. The DseRoleManager +# stores role options in the dse_security keyspace. +# Please increase the dse_security keyspace replication factor when using this role +# manager. Additional configuration is required in dse.yaml. +# +# All other role managers, including CassandraRoleManager are deprecated, and some +# security features might not work correctly if they are used. +role_manager: com.datastax.bdp.cassandra.auth.DseRoleManager + +# Whether to enable system keyspace filtering so that users can access and view +# only schema information for rows in the system and system_schema keyspaces to +# which they have access. Security requirements and user permissions apply. +# Enable this feature only after appropriate user permissions are granted. +# +# See Managing keyspace and table permissions at +# https://docs.datastax.com/en/dse/6.7/dse-admin/datastax_enterprise/security/secSystemKeyspaces.html +# +# Default: false +system_keyspaces_filtering: false + +# Validity period for roles cache (fetching granted roles can be an expensive +# operation depending on the role manager) +# Granted roles are cached for authenticated sessions in AuthenticatedUser and +# after the period specified here, become eligible for (async) reload. +# Defaults to 120000, set to 0 to disable caching entirely. +# Will be disabled automatically if internal authentication is disabled +# when using DseAuthenticator. +roles_validity_in_ms: 120000 + +# Refresh interval for roles cache (if enabled). +# After this interval, cache entries become eligible for refresh. On next +# access, an async reload is scheduled and returns the old value until the reload +# completes. If roles_validity_in_ms is non-zero, then this value must be non-zero +# also. +# Defaults to the same value as roles_validity_in_ms. +# roles_update_interval_in_ms: 2000 + +# Validity period for permissions cache (fetching permissions can be an +# expensive operation depending on the authorizer). +# Defaults to 120000, set to 0 to disable. +# Will be disabled automatically if authorization is disabled when +# using DseAuthorizer. +permissions_validity_in_ms: 120000 + +# Refresh interval for permissions cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If permissions_validity_in_ms is non-zero, then this value must also be +# non-zero. +# Defaults to the same value as permissions_validity_in_ms. +# permissions_update_interval_in_ms: 2000 + +# The partitioner is responsible for distributing groups of rows (by +# partition key) across nodes in the cluster. You should leave this +# alone for new clusters. The partitioner can NOT be changed without +# reloading all data, so when upgrading you should set this to the +# same partitioner you were already using. 
+# +# Besides Murmur3Partitioner, partitioners included for backwards +# compatibility include RandomPartitioner, ByteOrderedPartitioner, and +# OrderPreservingPartitioner. +# +partitioner: org.apache.cassandra.dht.Murmur3Partitioner + +# Directories where the database should store data on disk. The data +# is spread evenly across the directories, subject to the granularity of +# the configured compaction strategy. +# If not set, the default directory is $DSE_HOME/data/data. +data_file_directories: + - /var/lib/cassandra/data + +# Metadata directory that holds information about the cluster, local node and its peers. +# Currently, only a single subdirectory called 'nodes' will be used. +# If not set, the default directory is $CASSANDRA_HOME/data/metadata. +metadata_directory: /var/lib/cassandra/metadata + +# Commit log directory. When running on magnetic HDD, this directory should be on a +# separate spindle than the data directories. +# If not set, the default directory is $DSE_HOME/data/commitlog. +commitlog_directory: /var/lib/cassandra/commitlog + +# Whether to enable CDC functionality on a per-node basis. CDC functionality modifies the logic used +# for write path allocation rejection. When false (standard behavior), never reject. When true (use cdc functionality), +# reject mutation that contains a CDC-enabled table if at space limit threshold in cdc_raw_directory. +cdc_enabled: true + +# CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the +# segment contains mutations for a CDC-enabled table. This directory should be placed on a +# separate spindle than the data directories. If not set, the default directory is +# $DSE_HOME/data/cdc_raw. +cdc_raw_directory: /var/lib/cassandra/cdc_raw + +# Policy for data disk failures: +# +# die +# shut down gossip and client transports and kill the JVM for any fs errors or +# single-sstable errors, so the node can be replaced. +# +# stop_paranoid +# shut down gossip and client transports even for single-sstable errors, +# kill the JVM for errors during startup. +# +# stop +# shut down gossip and client transports, leaving the node effectively dead, but +# can still be inspected via JMX, kill the JVM for errors during startup. +# +# best_effort +# stop using the failed disk and respond to requests based on +# remaining available sstables. This means you WILL see obsolete +# data at CL.ONE! +# +# ignore +# ignore fatal errors and let requests fail, as in pre-1.2 Cassandra +disk_failure_policy: stop + +# Policy for commit disk failures: +# +# die +# shut down the node and kill the JVM, so the node can be replaced. +# +# stop +# shut down the node, leaving the node effectively dead, node +# can still be inspected via JMX. +# +# stop_commit +# shutdown the commit log, letting writes collect but +# continuing to service reads, as in pre-2.0.5 Cassandra +# +# ignore +# ignore fatal errors and let the batches fail +commit_failure_policy: stop + +# Maximum size of the native protocol prepared statement cache. +# +# Note that specifying a too large value will result in long running GCs and possbily +# out-of-memory errors. Keep the value at a small fraction of the heap. +# +# If you constantly see "prepared statements discarded in the last minute because +# cache limit reached" messages, the first step is to investigate the root cause +# of these messages and check whether prepared statements are used correctly - +# i.e. use bind markers for variable parts. 
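+# (As an illustration, using the demo_ks.demo_table schema from this patch's README: prefer
+# preparing `SELECT v FROM demo_ks.demo_table WHERE k = ?` once and binding values at execution
+# time, rather than re-preparing the statement with each literal value inlined.)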
+# +# Change the default value only if there are more prepared statements than +# fit in the cache. In most cases, it is not neccessary to change this value. +# Constantly re-preparing statements is a performance penalty. +# +# Valid value is a number greater than 0. When not set, the default is calculated. +# +# The default calculated value is 1/256th of the heap or 10 MB, whichever is greater. +prepared_statements_cache_size_mb: + +# Row cache implementation class name. Available implementations: +# +# org.apache.cassandra.cache.OHCProvider +# Fully off-heap row cache implementation (default). +# +# org.apache.cassandra.cache.SerializingCacheProvider +# This is the row cache implementation availabile +# in previous releases of Cassandra. +# row_cache_class_name: org.apache.cassandra.cache.OHCProvider + +# Maximum size of the row cache in memory. +# OHC cache implementation requires additional off-heap memory to manage +# the map structures and additional in-flight memory during operations before/after cache entries can be +# accounted against the cache capacity. This overhead is usually small compared to the whole capacity. +# Do not specify more memory that the system can afford in the worst usual situation and leave some +# headroom for OS block level cache. Never allow your system to swap. +# +# Default value is 0 to disable row caching. +row_cache_size_in_mb: 0 + +# Duration in seconds after which the database should save the row cache. +# Caches are saved to saved_caches_directory as specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 0 to disable saving the row cache. +row_cache_save_period: 0 + +# Number of keys from the row cache to save. +# Specify 0 (which is the default), meaning all keys are going to be saved +# row_cache_keys_to_save: 100 + +# Maximum size of the counter cache in memory. +# +# Counter cache helps to reduce counter locks' contention for hot counter cells. +# In case of RF = 1 a counter cache hit will cause the database to skip the read before +# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration +# of the lock hold, helping with hot counter cell updates, but will not allow skipping +# the read entirely. Only the local (clock, count) tuple of a counter cell is kept +# in memory, not the whole counter, so it's relatively cheap. +# +# NOTE: if you reduce the size, you might not get the hottest keys loaded on startup. +# +# When not set, the default value is calculated (min(2.5% of Heap (in MB), 50MB)). +# Set to 0 to disable counter cache. +# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache. +counter_cache_size_in_mb: + +# Duration in seconds after which the database should +# save the counter cache (keys only). Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Default is 7200 (2 hours). +counter_cache_save_period: 7200 + +# Number of keys from the counter cache to save. +# Disabled by default. When commented out (disabled), all keys are saved. +# counter_cache_keys_to_save: 100 + +# Saved caches directory. +# If not set, the default directory is $DSE_HOME/data/saved_caches. +saved_caches_directory: /var/lib/cassandra/saved_caches + +# commitlog_sync +# Valid commitlog_sync values are periodic, group, or batch. 
+# +# When in batch mode, the database won't ack writes until the commit log +# has been flushed to disk. Each incoming write will trigger the flush task. +# commitlog_sync_batch_window_in_ms is a deprecated value. Previously it had +# almost no value, and is being removed. +# +# commitlog_sync_batch_window_in_ms: 2 +# +# group mode is similar to batch mode, where the database will not ack writes +# until the commit log has been flushed to disk. The difference is group +# mode will wait up to commitlog_sync_group_window_in_ms between flushes. +# +# commitlog_sync_group_window_in_ms: 1000 +# +# The default is periodic. When in periodic mode, writes can be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period_in_ms. +commitlog_sync: periodic +commitlog_sync_period_in_ms: 500 + +# The size of the individual commitlog file segments. A commitlog +# segment can be archived, deleted, or recycled after all the data +# in it (potentially from each table in the system) has been +# flushed to sstables. +# +# The default size is 32, which is almost always fine, but if you are +# archiving commitlog segments (see commitlog_archiving.properties), +# then you probably want a finer granularity of archiving; 8 or 16 MB +# is reasonable. +# Max mutation size is also configurable via max_mutation_size_in_kb setting in +# cassandra.yaml. When max_mutation_size_in_kb is not set, the calculated default is half the size +# commitlog_segment_size_in_mb * 1024. This value should be positive and less than 2048. +# +# NOTE: If max_mutation_size_in_kb is set explicitly, then commitlog_segment_size_in_mb must +# be set to at least twice the size of max_mutation_size_in_kb / 1024 +# +commitlog_segment_size_in_mb: 32 + +# Compression to apply to the commit log. +# When not set, the default compression for the commit log is uncompressed. +# LZ4, Snappy, and Deflate compressors are supported. +# commitlog_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Any class that implements the SeedProvider interface and has a +# constructor that takes a Map of parameters is valid. +seed_provider: + # Addresses of hosts that are deemed contact points. + # Database nodes use this list of hosts to find each other and learn + # the topology of the ring. You _must_ change this if you are running + # multiple nodes! + - class_name: org.apache.cassandra.locator.SimpleSeedProvider + parameters: + # seeds is actually a comma-delimited list of addresses. + # Ex: ",," + - seeds: "127.0.0.1" + +# Maximum memory used for file buffers that are stored in the file cache, also +# known as the chunk cache. This is used as a cache that holds uncompressed +# sstable chunks, potentially for a very long time (until the sstable is obsoleted +# by compaction or until the data is evicted by the cache). +# When not set, the default is calculated as 1/4 of (system RAM - max heap). +# This pool is allocated off-heap but the chunk cache also has on-heap overhead +# which is roughly 120 bytes per entry. +# Memory is allocated only when needed but is not released. +# file_cache_size_in_mb: 4096 + +# In addition to buffers stored in the file cache, buffers are also used for transient +# operations such as reading sstables (when the data to be read is larger than the file cache buffer size), +# reading hints or CRC files. Buffers used for such operations are kept in memory +# in order to avoid continuous allocations, up to this limit. 
+# A buffer is typically used by a read operation and then returned to this pool when the operation is finished +# so that it can be reused by other operations. +# When not set the default is 2M per core plus 2M for all other threads capped at 128 MiB. +# Memory is allocated only when needed but is not released. +# direct_reads_size_in_mb: 128 + + +# The strategy for optimizing disk read. +# Possible values are: +# ssd (for solid state disks, the default). When not set, the default is ssd. +# spinning (for spinning disks) +# disk_optimization_strategy: ssd + +# Total permitted memory to use for memtables. The database will stop +# accepting writes when the limit is exceeded until a flush completes, +# and will trigger a flush based on memtable_cleanup_threshold +# If omitted, the calculated value is 1/4 the size of the heap. +# memtable_space_in_mb: 2048 + + +# Ratio of occupied non-flushing memtable size to total permitted size +# that will trigger a flush of the largest memtable. Larger mct will +# mean larger flushes and hence less compaction, but also less concurrent +# flush activity which can make it difficult to keep your disks fed +# under heavy write load. +# +# memtable_cleanup_threshold defaults to max(0.15, 1 / (memtable_flush_writers + 1)) +# memtable_cleanup_threshold: 0.15 + +# Specify the way the database allocates and manages memtable memory. +# Options are: +# +# heap_buffers +# on heap nio buffers +# +# offheap_buffers +# off heap (direct) nio buffers +# +# offheap_objects +# off heap objects +memtable_allocation_type: offheap_objects + +# Disk usage threshold that will trigger the database to reclaim some space +# used by the commit log files. +# +# If the commit log disk usage exceeds this threshold, the database will flush +# every dirty table in the oldest segment and remove it. So a small total +# commitlog space will cause more flush activity on less-active +# tables. +# +# The default value is the smaller of 8192, and 1/4 of the total space +# of the commitlog volume. +# +# The database will still write commit logs while it reclaims space +# from previous commit logs. Therefore, the total disk space "reserved" +# for the commit log should be _at least_ 25% bigger than the value of the +# commitlog_total_space_in_mb configuration parameter. The actual +# value depends on the write workload. +# +# commitlog_total_space_in_mb: 8192 + +# The number of memtable flush writer threads per disk and +# the total number of memtables that can be flushed concurrently. +# These are generally a combination of compute and IO bound. +# +# Memtable flushing is more CPU efficient than memtable ingest and a single thread +# can keep up with the ingest rate of a whole server on a single fast disk +# until it temporarily becomes IO bound under contention typically with compaction. +# At that point you need multiple flush threads. At some point in the future +# it may become CPU bound all the time. +# +# You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation +# metric, which should be 0. A non-zero metric occurs if threads are blocked waiting on flushing +# to free memory. +# +# memtable_flush_writers defaults to 8, and this means 8 Memtables can be flushed concurrently +# to a single data directory. +# +# There is a direct tradeoff between number of memtables that can be flushed concurrently +# and flush size and frequency. More is not better you just need enough flush writers +# to never stall waiting for flushing to free memory. 
+# +# memtable_flush_writers: 8 + +# Total space to use for change-data-capture logs on disk. +# +# If space gets above this value, the database will throw WriteTimeoutException +# on mutations including CDC-enabled tables. A CDCCompactor is responsible +# for parsing the raw CDC logs and deleting them when parsing is completed. +# +# The default value is calculated as the min of 4096 mb and 1/8th of the total space +# of the drive where cdc_raw_directory resides. +# cdc_total_space_in_mb: 4096 + +# When the cdc_raw limit is reached and the CDCCompactor is running behind +# or experiencing backpressure, we check at the following interval to see if any +# new space for cdc-tracked tables has been made available. Default to 250ms +# cdc_free_space_check_interval_ms: 250 + +# Whether to enable periodic fsync() when doing sequential writing. When enabled, fsync() at intervals +# force the operating system to flush the dirty +# buffers. Enable to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSDs; not +# necessarily on platters. +trickle_fsync: true +trickle_fsync_interval_in_kb: 10240 + +# TCP port, for commands and data. +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +storage_port: 7000 + +# SSL port, for encrypted communication. Unused unless enabled in +# encryption_options +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +ssl_storage_port: 7001 + +# Address or interface to bind to and tell other nodes to connect to. +# You _must_ change this address or interface to enable multiple nodes to communicate! +# +# Set listen_address OR listen_interface, not both. +# +# When not set (blank), InetAddress.getLocalHost() is used. This +# will always do the Right Thing _if_ the node is properly configured +# (hostname, name resolution, etc), and the Right Thing is to use the +# address associated with the hostname (it might not be). +# +# Setting listen_address to 0.0.0.0 is always wrong. +# +listen_address: localhost + +# Set listen_address OR listen_interface, not both. Interfaces must correspond +# to a single address. IP aliasing is not supported. +#listen_interface: wlan0 + +# If you specify the interface by name and the interface has an ipv4 and an ipv6 address, +# specify which address. +# If false, the first ipv4 address will be used. +# If true, the first ipv6 address will be used. +# When not set, the default is false (ipv4). +# If there is only one address, that address is selected regardless of ipv4/ipv6. +# listen_interface_prefer_ipv6: false + +# Address to broadcast to other database nodes. +# Leaving this blank will set it to the same value as listen_address +# broadcast_address: 1.2.3.4 + +# When using multiple physical network interfaces, set this +# to true to listen on broadcast_address in addition to +# the listen_address, allowing nodes to communicate in both +# interfaces. +# Do not set this property if the network configuration automatically +# routes between the public and private networks such as EC2. +# listen_on_broadcast_address: false + +# Internode authentication backend, implementing IInternodeAuthenticator; +# used to allow/disallow connections from peer nodes. +# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator + +# Whether to start the native transport server. +# The address on which the native transport is bound is defined by native_transport_address. 
+start_native_transport: true +# The port where the CQL native transport listens for clients. +# For security reasons, do not expose this port to the internet. Firewall it if needed. +native_transport_port: 9042 +# Enabling native transport encryption in client_encryption_options allows you to use +# encryption for the standard port or use a dedicated, additional port along with the unencrypted +# standard native_transport_port. +# If client encryption is enabled and native_transport_port_ssl is disabled, the +# native_transport_port (default: 9042) will encrypt all traffic. To use both unencrypted and encrypted +# traffic, enable native_transport_port_ssl. +# native_transport_port_ssl: 9142 +# +# The maximum size of allowed frame. Frame (requests) larger than this will +# be rejected as invalid. The default is 256 MB. If you're changing this parameter, +# you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048. +# native_transport_max_frame_size_in_mb: 256 + +# The maximum number of concurrent client connections. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections: -1 + +# The maximum number of concurrent client connections per source ip. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections_per_ip: -1 + +# Controls whether Cassandra honors older protocol versions +# The default is true, which means older protocols will be honored. +native_transport_allow_older_protocols: true + +# The address or interface to bind the native transport server to. +# +# Set native_transport_address OR native_transport_interface, not both. +# +# Leaving native_transport_address blank has the same effect as on listen_address +# (i.e. it will be based on the configured hostname of the node). +# +# Note that unlike listen_address, you can specify 0.0.0.0, but you must also +# set native_transport_broadcast_address to a value other than 0.0.0.0. +# +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +native_transport_address: localhost + +# Set native_transport_address OR native_transport_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# native_transport_interface: eth0 + +# If you specify the interface by name and the interface has an ipv4 and an ipv6 address, +# specify which address. +# If false, the first ipv4 address will be used. +# If true, the first ipv6 address will be used. +# When not set, the default is false (ipv4). +# If there is only one address, that address is selected regardless of ipv4/ipv6. +# native_transport_interface_prefer_ipv6: false + +# Native transport address to broadcast to drivers and other nodes. +# Do not set to 0.0.0.0. If left blank, this will be set to the value of +# native_transport_address. If native_transport_address is set to 0.0.0.0, native_transport_broadcast_address must +# be set. +# native_transport_broadcast_address: 1.2.3.4 + +# enable or disable keepalive on native connections +native_transport_keepalive: true + +# Uncomment to set socket buffer size for internode communication. 
+# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it, the buffer size is defined by net.ipv4.tcp_wmem +# See also: +# /proc/sys/net/core/wmem_max +# /proc/sys/net/core/rmem_max +# /proc/sys/net/ipv4/tcp_wmem +# /proc/sys/net/ipv4/tcp_wmem +# and 'man tcp' +# internode_send_buff_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication. +# Note that when setting this value, the buffer size is limited by net.core.wmem_max +# and when not setting this value, the buffer size is defined by net.ipv4.tcp_wmem +# internode_recv_buff_size_in_bytes: + +# Whether to create a hard link to each SSTable +# flushed or streamed locally in a backups/ subdirectory of the +# keyspace data. Incremental backups enable storing backups off site without transferring entire +# snapshots. The database does not automatically clear incremental backup files. +# DataStax recommends setting up a process to clear incremental backup hard links each time a new snapshot is created. +incremental_backups: false + +# Whether to enable snapshots before each compaction. +# Be careful using this option, since the database won't clean up the +# snapshots for you. A snapshot is useful to back up data when there is a data format change. +snapshot_before_compaction: false + +# Whether to enable snapshots of the data before truncating a keyspace or +# dropping a table. To prevent data loss, DataStax strongly advises using the default +# setting. If you set auto_snapshot to false, you lose data on truncation or drop. +auto_snapshot: true + +# Granularity of the collation index of rows within a partition. +# Smaller granularity means better search times, especially if +# the partition is in disk cache, but also higher size of the +# row index and the associated memory cost for keeping that cached. +# The performance of lower density nodes may benefit from decreasing +# this number to 4, 2 or 1kb. +column_index_size_in_kb: 16 + +# Threshold for the total size of all index entries for a partition that the database +# stores in the partition key cache. If the total size of all index entries for a partition +# exceeds this amount, the database stops putting entries for this partition into the partition +# key cache. +# +# Note that this size refers to the size of the +# serialized index information and not the size of the partition. +column_index_cache_size_in_kb: 2 + +# Number of simultaneous compactions allowed to run simultaneously, NOT including +# validation "compactions" for anti-entropy repair. Simultaneous +# compactions help preserve read performance in a mixed read/write +# workload by limiting the number of small SSTables that accumulate +# during a single long running compaction. When not set, the calculated default is usually +# fine. If you experience problems with compaction running too +# slowly or too fast, you should first review the +# compaction_throughput_mb_per_sec option. +# +# The calculated default value for concurrent_compactors defaults to the smaller of (number of disks, +# number of cores), with a minimum of 2 and a maximum of 8. +# +# If your data directories are backed by SSD, increase this +# to the number of cores. +#concurrent_compactors: 1 + +# Number of simultaneous repair validations to allow. Default is unbounded +# Values less than one are interpreted as unbounded (the default) +# concurrent_validations: 0 + +# Number of simultaneous materialized view builder tasks to allow. 
+concurrent_materialized_view_builders: 2 + +# Number of permitted concurrent lightweight transactions. +# A higher number might improve throughput if non-contending LWTs are in heavy use, +# but will use more memory and may fare worse with contention. +# +# The default value (equal to eight times the number of TPC cores) should be +# good enough for most cases. +# concurrent_lw_transactions: 128 + +# Maximum number of LWTs that can be queued up before the node starts reporting +# OverloadedException for LWTs. +# max_pending_lw_transactions: 10000 + +# Throttles compaction to the specified total throughput across the entire +# system. The faster you insert data, the faster you need to compact in +# order to keep the SSTable count down. In general, setting this to +# 16 to 32 times the rate you are inserting data is more than sufficient. +# Set to 0 to disable throttling. Note that this throughput applies for all types +# of compaction, including validation compaction. +compaction_throughput_mb_per_sec: 16 + +# The size of the SSTables to trigger preemptive opens. The compaction process opens +# SSTables before they are completely written and uses them in place +# of the prior SSTables for any range previously written. This process helps +# to smoothly transfer reads between the SSTables by reducing page cache churn and keeps hot rows hot. +# +# Setting this to a low value will negatively affect performance +# and eventually cause huge heap pressure and a lot of GC activity. +# The "optimal" value depends on the hardware and workload. +# +# Values <= 0 will disable this feature. +sstable_preemptive_open_interval_in_mb: 50 + +# With pick_level_on_streaming set to true, streamed-in sstables of tables using +# LCS (leveled comaction strategy) will be placed in the same level as on the +# source node (up-leveling may happen though). +# +# The previous behavior, and with pick_level_on_streaming set to false, the +# incoming sstables are placed in level 0. +# +# For operational tasks like 'nodetool refresh' or replacing a node, setting +# pick_level_on_streaming to true can save a lot of compaction work. +# +# Default is true +# pick_level_on_streaming: true + +# When enabled, permits Cassandra to zero-copy stream entire eligible +# SSTables between nodes, including every component. +# This speeds up the network transfer significantly subject to +# throttling specified by stream_throughput_outbound_megabits_per_sec. +# Enabling this will reduce the GC pressure on sending and receiving node. +# When unset, the default is enabled. While this feature tries to keep the +# disks balanced, it cannot guarantee it. This feature will be automatically +# disabled if internode encryption is enabled. Currently this can be used with +# Leveled Compaction. Once CASSANDRA-14586 is fixed other compaction strategies +# will benefit as well when used in combination with CASSANDRA-6696. +# +# stream_entire_sstables: true + +# Throttle, in megabits per seconds, for the throughput of all outbound streaming file transfers +# on a node. The database does mostly sequential I/O when streaming data during +# bootstrap or repair which can saturate the network connection and degrade +# client (RPC) performance. When not set, the value is 200 Mbps (25 MB/s). 
+# stream_throughput_outbound_megabits_per_sec: 200 + +# Throttle for all streaming file transfers between the datacenters, +# this setting allows users to throttle inter dc stream throughput in addition +# to throttling all network stream traffic as configured with +# stream_throughput_outbound_megabits_per_sec. +# When unset, the default is 200 Mbps (25 MB/s). +# inter_dc_stream_throughput_outbound_megabits_per_sec: 200 + +# How long the coordinator should wait for read operations to complete. +# Lowest acceptable value is 10 ms. This timeout does not apply to +# aggregated queries such as SELECT COUNT(*), MIN(x), etc. +read_request_timeout_in_ms: 5000 +# How long the coordinator should wait for seq or index scans to complete. +# Lowest acceptable value is 10 ms. This timeout does not apply to +# aggregated queries such as SELECT COUNT(*), MIN(x), etc. +range_request_timeout_in_ms: 10000 +# How long the coordinator should wait for aggregated read operations to complete, +# such as SELECT COUNT(*), MIN(x), etc. +aggregated_request_timeout_in_ms: 120000 +# How long the coordinator should wait for writes to complete. +# Lowest acceptable value is 10 ms. +write_request_timeout_in_ms: 2000 +# How long the coordinator should wait for counter writes to complete. +# Lowest acceptable value is 10 ms. +counter_write_request_timeout_in_ms: 5000 +# How long a coordinator should continue to retry a CAS operation +# that contends with other proposals for the same row. +# Lowest acceptable value is 10 ms. +cas_contention_timeout_in_ms: 1000 +# How long the coordinator should wait for truncates to complete +# The long default value allows the database to take a snapshot before removing the data. +# If auto_snapshot is disabled (not recommended), you can reduce this time. +# Lowest acceptable value is 10 ms. +truncate_request_timeout_in_ms: 60000 +# The default timeout for other, miscellaneous operations. +# Lowest acceptable value is 10 ms. +request_timeout_in_ms: 10000 +# Additional RTT latency between DCs applied to cross dc request. Set this property only when +# cross dc network latency is high. Value must be non-negative. +# Set this value to 0 to apply no additional RTT latency. When unset, the default is 0. +# cross_dc_rtt_in_ms: 0 + +# How long before a node logs slow queries. SELECT queries that exceed +# this timeout will generate an aggregated log message to identify slow queries. +# Set this value to zero to disable slow query logging. +slow_query_log_timeout_in_ms: 500 + +# Whether to enable operation timeout information exchange between nodes to accurately +# measure request timeouts. If disabled, replicas will assume that requests +# were forwarded to them instantly by the coordinator. During overload conditions this means extra +# time is required for processing already-timed-out requests. +# +# Warning: Before enabling this property make sure that NTP (network time protocol) is installed +# and the times are synchronized between the nodes. +cross_node_timeout: false + +# Interval to send keep-alive messages. The stream session fails when a keep-alive message +# is not received for 2 keep-alive cycles. When unset, the default is 300 seconds (5 minutes) +# so that a stalled stream times out in 10 minutes (2 cycles). +# streaming_keep_alive_period_in_secs: 300 + +# Maximum number of connections per host for streaming. +# Increase this when you notice that joins are CPU-bound rather that network- +# bound. For example, a few nodes with large files. 
+# streaming_connections_per_host: 1 + + +# The sensitivity of the failure detector on an exponential scale. Generally, this setting +# does not need adjusting. phi value that must be reached for a host to be marked down. +# When unset, the internal value is 8. +# phi_convict_threshold: 8 + +# When a tcp connection to another node is established, cassandra sends an echo +# request to see if the connection is actually usable. If an echo reply is not +# heard after this many tries, the connection will be destroyed and +# reestablished to try again. Each attempt roughly translates to 1 second. +# +# echo_attempts_before_reset: 10 + +# endpoint_snitch -- A class that implements the IEndpointSnitch interface. The database uses the +# snitch to locate nodes and route requests. Use only snitch implementations that are bundled with DSE. +# +# THE DATABASE WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH +# AFTER DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. +# This means that if you start with the default SimpleSnitch, which +# locates every node on "rack1" in "datacenter1", your only options +# if you need to add another datacenter are GossipingPropertyFileSnitch +# (and the older PFS). From there, if you want to migrate to an +# incompatible snitch like Ec2Snitch you can do it by adding new nodes +# under Ec2Snitch (which will locate them in a new "datacenter") and +# decommissioning the old nodes. +# +# Supported snitches from Cassandra: +# +# SimpleSnitch: +# Treats Strategy order as proximity. This can improve cache +# locality when disabling read repair. Appropriate only for +# single-datacenter deployments. +# +# GossipingPropertyFileSnitch +# This should be your go-to snitch for production use. The rack +# and datacenter for the local node are defined in +# cassandra-rackdc.properties and propagated to other nodes via +# gossip. For migration from the PropertyFileSnitch, uses the cassandra-topology.properties +# file if it is present. +# +# PropertyFileSnitch: +# Proximity is determined by rack and data center, which are +# explicitly configured in cassandra-topology.properties. +# +# Ec2Snitch: +# Appropriate for EC2 deployments in a single Region. Loads Region +# and Availability Zone information from the EC2 API. The Region is +# treated as the datacenter, and the Availability Zone as the rack. +# Only private IPs are used, so this will not work across multiple +# Regions. +# +# Ec2MultiRegionSnitch: +# Uses public IPs as broadcast_address to allow cross-region +# connectivity. This means you must also set seed addresses to the public +# IP and open the storage_port or +# ssl_storage_port on the public IP firewall. For intra-Region +# traffic, the database will switch to the private IP after +# establishing a connection. +# +# RackInferringSnitch: +# Proximity is determined by rack and data center, which are +# assumed to correspond to the 3rd and 2nd octet of each node's IP +# address, respectively. Unless this happens to match your +# deployment conventions, this is best used as an example of +# writing a custom Snitch class and is provided in that spirit. +# +# DataStax Enterprise (DSE) provides: +# +# com.datastax.bdp.snitch.DseSimpleSnitch: +# Proximity is determined by DSE workload, which places transactional, +# Analytics, and Search nodes into their separate datacenters. +# Appropriate only for Development deployments. 
+# +endpoint_snitch: com.datastax.bdp.snitch.DseSimpleSnitch + +# How often to perform the more expensive part of host score +# calculation. Use care when reducing this interval, score calculation is CPU intensive. +dynamic_snitch_update_interval_in_ms: 100 +# How often to reset all host scores, allowing a bad host to +# possibly recover. +dynamic_snitch_reset_interval_in_ms: 600000 +# if set greater than zero, this will allow +# 'pinning' of replicas to hosts in order to increase cache capacity. +# The badness threshold will control how much worse the pinned host has to be +# before the dynamic snitch will prefer other replicas over it. This is +# expressed as a double which represents a percentage. Thus, a value of +# 0.2 means the database would continue to prefer the static snitch values +# until the pinned host was 20% worse than the fastest. +dynamic_snitch_badness_threshold: 0.1 + +# Enable or disable inter-node encryption +# JVM defaults for supported SSL socket protocols and cipher suites can +# be replaced using custom encryption options. This is not recommended +# unless you have policies in place that dictate certain settings, or +# need to disable vulnerable ciphers or protocols in case the JVM cannot +# be updated. +# FIPS compliant settings can be configured at JVM level and should not +# involve changing encryption settings here: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html +# *NOTE* No custom encryption options are enabled at the moment +# The available internode options are : all, none, dc, rack +# +# If set to dc, encrypt the traffic between the DCs +# If set to rack, encrypt the traffic between the racks +# +# The passwords used in these options must match the passwords used when generating +# the keystore and truststore. For instructions on generating these files, see: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# +# KeyStore types can be JKS, JCEKS, PKCS12 or PKCS11 +# For PKCS11 the "java.security" file must be updated to register the PKCS11 JNI binding +# and the relevant native binaries installed. +# For more information see: https://docs.oracle.com/javase/8/docs/technotes/guides/security/p11guide.html +server_encryption_options: + internode_encryption: none + keystore: resources/dse/conf/.keystore + keystore_password: cassandra + truststore: resources/dse/conf/.truststore + truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # + # replaces the deprecated store_type for keystore, valid types can be JKS, JCEKS, PKCS12 or PKCS11 + # for file based keystores prefer PKCS12 + # keystore_type: JKS + # + # replaces the deprecated store_type for truststore, valid types can be JKS, JCEKS, PKCS12 or PKCS11 + # for file based keystores prefer PKCS12 + # truststore_type: JKS + # + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + # require_client_auth: false + # require_endpoint_verification: false + +# enable or disable client/server encryption. +client_encryption_options: + enabled: false + # If enabled and optional is set to true, encrypted and unencrypted connections over native transport are handled. 
+ optional: false + keystore: resources/dse/conf/.keystore + keystore_password: cassandra + # require_client_auth: false + # Set trustore and truststore_password if require_client_auth is true + # truststore: resources/dse/conf/.truststore + # truststore_password: cassandra + # More advanced defaults below: + # protocol: TLS + # algorithm: SunX509 + # + # replaces the deprecated store_type for keystore, valid types can be JKS, JCEKS, PKCS12 or PKCS11 + # for file based keystores prefer PKCS12 + # keystore_type: JKS + # + # replaces the deprecated store_type for truststore, valid types can be JKS, JCEKS, PKCS12 or PKCS11 + # for file based keystores prefer PKCS12 + # truststore_type: JKS + # + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + +# internode_compression controls whether traffic between nodes is +# compressed. +# Can be: +# +# all +# all traffic is compressed +# +# dc +# traffic between different datacenters is compressed +# +# none +# nothing is compressed. +internode_compression: dc + +# Enable or disable tcp_nodelay for inter-dc communication. +# Disabling it will result in larger (but fewer) network packets being sent, +# reducing overhead from the TCP protocol itself, at the cost of increasing +# latency if you block for cross-datacenter responses. +inter_dc_tcp_nodelay: false + +# TTL for different trace types used during logging of the repair process. +tracetype_query_ttl: 86400 +tracetype_repair_ttl: 604800 + +# The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation. +# Lowering this value on Windows can provide much tighter latency and better throughput, however +# some virtualized environments may see a negative performance impact from changing this setting +# below their system default. The sysinternals 'clockres' tool can confirm your system's default +# setting. +windows_timer_interval: 1 + +# UDFs (user defined functions) are disabled by default. +# +# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. +enable_user_defined_functions: false + +# Enables scripted UDFs (JavaScript UDFs). +# +# Java UDFs are always enabled, if enable_user_defined_functions is true. +# Enable this option to be able to use UDFs with "language javascript". +# This option has no effect, if enable_user_defined_functions is false. +# +# Note that JavaScript UDFs are noticeable slower and produce more garbage on the heap than Java UDFs +# and can therefore negatively affect overall database performance. +enable_scripted_user_defined_functions: false + +# Optionally disable asynchronous UDF execution. +# Note: Java UDFs are not run asynchronously. +# +# Disabling asynchronous UDF execution also implicitly disables the security-manager! +# By default, asynchronous UDF execution is enabled to be able to detect UDFs that run too long / forever and be +# able to fail fast - i.e. stop the Cassandra daemon, which is currently the only appropriate approach to +# "tell" a user that there's something really wrong with the UDF. +# When you disable async UDF execution, users MUST pay attention to read-timeouts since these timeouts might indicate +# UDFs that run too long or forever which can destabilize the cluster. 
+# Currently UDFs within the GROUP BY clause are allowed only when asynchronous UDF execution is disabled, +# subjected to the afforementioned security caveats. +enable_user_defined_functions_threads: true + +# Time in microseconds (CPU time) after a warning will be emitted to the log and +# to the client that a UDF runs too long. +# Java-UDFs will always emit a warning, script-UDFs only if +# enable_user_defined_functions_threads is set to true. +user_defined_function_warn_micros: 500 + +# Time in microseconds (CPU time) after a fatal UDF run-time situation is detected. +# For Java-UDFs the function is safely aborted. +# For script-UDFs the action according to user_function_timeout_policy will take place. +# Java-UDFs will always throw an exception, script-UDFs only if +# enable_user_defined_functions_threads is set to true. +user_defined_function_fail_micros: 10000 + +# If a Java UDF allocates more than user_defined_function_warn_heap_mb on the heap, +# a warning will be emitted to the log and the client. +# Java-UDFs will always emit a warning, script-UDFs only if +# enable_user_defined_functions_threads is set to true. +user_defined_function_warn_heap_mb: 200 + +# UDFs that allocate more than user_defined_function_fail_heap_mb, will fail. +# For Java-UDFs the function is safely aborted. +# For script-UDFs the action according to user_function_timeout_policy will take place. +# Java-UDFs will always throw an exception, script-UDFs only if +# enable_user_defined_functions_threads is set to true. +user_defined_function_fail_heap_mb: 500 + +# Defines what to do when a script-UDF ran longer than user_defined_function_fail_timeout. +# (Only valid, if enable_user_defined_functions_threads is set to true) +# Possible options are: +# - 'die' - i.e. it is able to emit a warning to the client before the Cassandra Daemon +# will shut down. +# - 'die_immediate' - shut down C* daemon immediately (effectively prevent the chance that +# the client will receive a warning). +# - 'ignore' - just log - the most dangerous option. +user_function_timeout_policy: die + + +# Enables encrypting data at-rest (on disk). Different key providers are supported, but the default KSKeyProvider reads from +# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by +# the "key_alias" is the only key that will be used for encrypt opertaions; previously used keys +# can still (and should!) be in the keystore and will be used on decrypt operations +# to handle key rotation. +# +# DataStax recommends installing Java Cryptography Extension (JCE) Unlimited Strength Jurisdiction +# Policy Files for your version of the JDK to ensure support of all encryption algorithms. +# See the DSE installation documentation. +# +transparent_data_encryption_options: + enabled: false + chunk_length_kb: 64 + cipher: AES/CBC/PKCS5Padding + key_alias: testing:1 + # CBC IV length for AES must be 16 bytes, the default size + # iv_length: 16 + key_provider: + - class_name: org.apache.cassandra.security.JKSKeyProvider + parameters: + - keystore: conf/.keystore + keystore_password: cassandra + store_type: JCEKS + key_password: cassandra + + +##################### +# SAFETY THRESHOLDS # +##################### + +# GC Pauses greater than 200 ms will be logged at INFO level. +# Adjust this threshold to minimize logging, if necessary. +# gc_log_threshold_in_ms: 200 + +# GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level. +# Adjust this threshold based on your application throughput requirement. 
+# Set to 0 to deactivate the feature. +# gc_warn_threshold_in_ms: 1000 + +# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption +# early. Any value size larger than this threshold will result in marking an SSTable +# as corrupted. This value should be positive and less than 2048. +# max_value_size_in_mb: 256 + +# Probability the database will gossip with one of the seed nodes during each round of gossip. +# Valid range is between 0.01 and 1.0 +# seed_gossip_probability: 1.0 + +# Back-pressure settings # +# If enabled, the coordinator will apply the back-pressure strategy specified below to each mutation +# sent to replicas, with the aim of reducing pressure on overloaded replicas. +back_pressure_enabled: false +# The back-pressure strategy applied. +# The default implementation, RateBasedBackPressure, takes three arguments: +# high ratio, factor, and flow type, and uses the ratio between incoming mutation responses and outgoing mutation requests. +# If below high ratio, outgoing mutations are rate limited according to the incoming rate decreased by the given factor; +# if above high ratio, the rate limiting is increased by the given factor; +# the recommended factor is a whole number between 1 and 10, use larger values for a faster recovery +# at the expense of potentially more dropped mutations; +# the rate limiting is applied according to the flow type: if FAST, it's rate limited at the speed of the fastest replica, +# if SLOW at the speed of the slowest one. +# New strategies can be added. Implementors need to implement org.apache.cassandra.net.BackpressureStrategy and +# provide a public constructor that accepts Map. +back_pressure_strategy: + - class_name: org.apache.cassandra.net.RateBasedBackPressure + parameters: + - high_ratio: 0.90 + factor: 5 + flow: FAST + +# Coalescing Strategies # +# Coalescing multiples messages turns out to significantly boost message processing throughput (think doubling or more). +# On bare metal, the floor for packet processing throughput is high enough that many applications won't notice, but in +# virtualized environments, the point at which an application can be bound by network packet processing can be +# surprisingly low compared to the throughput of task processing that is possible inside a VM. It's not that bare metal +# doesn't benefit from coalescing messages, it's that the number of packets a bare metal network interface can process +# is sufficient for many applications such that no load starvation is experienced even without coalescing. +# There are other benefits to coalescing network messages that are harder to isolate with a simple metric like messages +# per second. By coalescing multiple tasks together, a network thread can process multiple messages for the cost of one +# trip to read from a socket, and all the task submission work can be done at the same time reducing context switching +# and increasing cache friendliness of network message processing. + +# Strategy to use for coalescing messages. +# Can be fixed, movingaverage or timehorizon, and is disabled by default; enable if you want to tune for higher +# throughput, potentially at the expense of latency. +# You can also specify a subclass of CoalescingStrategies.CoalescingStrategy by name. +# otc_coalescing_strategy: DISABLED + +# How many microseconds to wait for coalescing. For fixed strategy, this is the amount of time after the first +# message is received before it will be sent with any accompanying messages. 
For movingaverage strategy, this is the +# maximum amount of time that will be waited as well as the interval at which messages must arrive on average +# for coalescing to be enabled. +# otc_coalescing_window_us: 100 + +# Do not try to coalesce messages if we already got that many messages. This should be between 1 and 128 (inclusive). +# otc_coalescing_enough_coalesced_messages: 32 + +# Size in KB of the direct buffer used to write messages to small/large outbound internode connections. There's one such +# buffer for every node in the cluster, per connection type. Messages larger than this buffer will require the allocation +# of a new buffer, so size this accordingly: it should be big enough to accommodate at least the average message size +# (and possibly more, to allow for batch flushing), but not too large, to avoid running out of memory. +# otc_small_max_message_buffer_kb: 64 +# otc_large_max_message_buffer_kb: 1024 + +# Continuous paging settings. When requested by the client, pages are pushed continuously to the client. +# These settings are used to calculate the maximum memory used: +# (max_concurrent_sessions * max_session_pages * max_page_size_mb). +# The default values (60 x 4 x 8) = 1920 MB of maximum memory used. The only case in which a page may be bigger than +# max_page_size_mb is if an individual CQL row is larger than this value. +continuous_paging: + # The maximum number of concurrent sessions, any additional session will be rejected with an unavailable error. + max_concurrent_sessions: 60 + # The maximum number of pages that can be buffered for each session + max_session_pages: 4 + # The maximum size of a page, in MB. If an individual CQL row is larger than this value, the page can be larger than + # this value. + max_page_size_mb: 8 + # The maximum time in milliseconds for which a local continuous query will run, assuming the client continues + # reading or requesting pages. When this threshold is exceeded, the session is swapped out and rescheduled. + # Swapping and rescheduling resources ensures the release of resources including those that prevent the memtables + # from flushing. Adjust when high write workloads exist on tables that have + # continuous paging requests. + max_local_query_time_ms: 5000 + # The maximum time the server will wait for a client to request more pages, in seconds, assuming the + # server queue is full or the client has not required any more pages via a backpressure update request. + # Increase this value for extremely large page sizes (max_page_size_mb) + # or for extremely slow networks. + client_timeout_sec: 600 + # How long the server waits for a cancel request to complete, in seconds. + cancel_timeout_sec: 5 + # How long the server will wait, in milliseconds, before checking if a continuous paging session can be resumed when + # the session is paused because of backpressure. + paused_check_interval_ms: 1 + +# Track a metric per keyspace indicating whether replication achieved the ideal consistency +# level for writes without timing out. This is different from the consistency level requested by +# each write which may be lower in order to facilitate availability. +# ideal_consistency_level: EACH_QUORUM + +# NodeSync settings. +nodesync: + # The (maximum) rate (in kilobytes per second) for data validation. + rate_in_kb: 1024 + +# Emulates DataStax Constellation database-as-a-service defaults. 
+# +# When enabled, some defaults (both server side and of connecting DataStax drivers) are modified to match those +# used by DataStax Constellation (DataStax cloud data platform). This includes (but is not limited to) stricter +# guardrails defaults, or the use of LOCAL_QUORUM as default consistency level. +# +# This can be used as an convenience to develop and test applications meant to run on DataStax Constellation. +# +# Warning: when enabled, the updated defaults reflect those of DataStax Constellation _at the time_ of the currently +# used DSE release. This is a best-effort emulation of said defaults. Further, all nodes must use the same +# config value. +# emulate_dbaas_defaults: false + +# Guardrails settings. +# guardrails: + # When executing a scan, within or across a partition, we need to keep the + # tombstones seen in memory so we can return them to the coordinator, which + # will use them to make sure other replicas also know about the deleted rows. + # With workloads that generate a lot of tombstones, this can cause performance + # problems and even exhaust the server heap. + # (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) + # Adjust the thresholds here if you understand the dangers and want to + # scan more tombstones anyway. These thresholds may also be adjusted at runtime + # using the StorageService mbean. + # + # Default tombstone_warn_threshold is 1000, may differ if emulate_dbaas_defaults is enabled + # Default tombstone_failure_threshold is 100000, may differ if emulate_dbaas_defaults is enabled + # tombstone_warn_threshold: 1000 + # tombstone_failure_threshold: 100000 + + # Log a warning when compacting partitions larger than this value. + # Default value is 100mb, may differ if emulate_dbaas_defaults is enabled + # partition_size_warn_threshold_in_mb: 100 + + # Log WARN on any multiple-partition batch size that exceeds this value. 64kb per batch by default. + # Use caution when increasing the size of this threshold as it can lead to node instability. + # Default value is 64kb, may differ if emulate_dbaas_defaults is enabled + # batch_size_warn_threshold_in_kb: 64 + + # Fail any multiple-partition batch that exceeds this value. The calculated default is 640kb (10x warn threshold). + # Default value is 640kb, may differ if emulate_dbaas_defaults is enabled + # batch_size_fail_threshold_in_kb: 640 + + # Log WARN on any batches not of type LOGGED than span across more partitions than this limit. + # Default value is 10, may differ if emulate_dbaas_defaults is enabled + # unlogged_batch_across_partitions_warn_threshold: 10 + + # Failure threshold to prevent writing large column value into Cassandra. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # column_value_size_failure_threshold_in_kb: -1 + + # Failure threshold to prevent creating more columns per table than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # columns_per_table_failure_threshold: -1 + + # Failure threshold to prevent creating more fields in user-defined-type than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # fields_per_udt_failure_threshold: -1 + + # Warning threshold to warn when encountering larger size of collection data than threshold. 
+ # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # collection_size_warn_threshold_in_kb: -1 + + # Warning threshold to warn when encountering more elements in collection than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # items_per_collection_warn_threshold: -1 + + # Whether read-before-write operation is allowed, eg. setting list element by index, removing list element + # by index. Note: LWT is always allowed. + # Default true to allow read before write operation, may differ if emulate_dbaas_defaults is enabled + # read_before_write_list_operations_enabled: true + + # Failure threshold to prevent creating more secondary index per table than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # secondary_index_per_table_failure_threshold: -1 + + # Failure threshold to prevent creating more materialized views per table than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # materialized_view_per_table_failure_threshold: -1 + + # Warn threshold to warn creating more tables than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # tables_warn_threshold: -1 + + # Failure threshold to prevent creating more tables than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # tables_failure_threshold: -1 + + # Preventing creating tables with provided configurations. + # Default all properties are allowed, may differ if emulate_dbaas_defaults is enabled + # table_properties_disallowed: + + # Whether to allow user-provided timestamp in write request + # Default true to allow user-provided timestamp, may differ if emulate_dbaas_defaults is enabled + # user_timestamps_enabled: true + + # Preventing query with provided consistency levels + # Default all consistency levels are allowed. + # write_consistency_levels_disallowed: + + # Failure threshold to prevent providing larger paging by bytes than threshold, also served as a hard paging limit + # when paging by rows is used. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # page_size_failure_threshold_in_kb: -1 + + # Failure threshold to prevent IN query creating size of cartesian product exceeding threshold, eg. + # "a in (1,2,...10) and b in (1,2...10)" results in cartesian product of 100. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # in_select_cartesian_product_failure_threshold: -1 + + # Failure threshold to prevent IN query containing more partition keys than threshold + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # partition_keys_in_select_failure_threshold: -1 + + # Warning threshold to warn when local disk usage exceeding threshold. Valid values: (1, 100] + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # disk_usage_percentage_warn_threshold: -1 + + # Failure threshold to reject write requests if replica disk usage exceeding threshold. Valid values: (1, 100] + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # disk_usage_percentage_failure_threshold: -1 + +# TPC settings - WARNING it is generally not advised to change these values unless directed by a performance expert + +# Number of cores used by the internal Threads Per Core architecture (TPC). This setting corresponds to the +# number of event loops that will be created internally. Do not tune. 
DataStax recommends contacting the DataStax Services +# team before changing this value. If unset or commented out (the default), the calculated default value is the number +# of available processors on the machine minus one. +# tpc_cores: + +# Number of cores used for reads. Do not tune. DataStax recommends contacting the DataStax Services +# team before changing this value. By default this is set to min(tpc_cores, io_global_queue_depth / 4), which means +# that each IO queue must have at least a local depth of 4 and we choose a number of IO queues, or IO cores, such that the +# combined depth does not exceed io_global_queue_depth, capped to the number of TPC cores. +# tpc_io_cores: + +# The global IO queue depth that is used for reads when AIO is enabled (the default for SSDs). +# The default value used is the value in /sys/class/block/sd[a|b...]/queue/nr_requests, +# which is typically 128. This default value is a starting point for tuning. You can also run tools/bin/disk_cal.py to +# determine the ideal queue depth for a specific disk. However, capping to the ideal +# queue depth assumes that all TPC IO cores will be fully working during read workloads. If that's not the case, +# you might want to double the ideal queue depth, for example. Exceeding the value used by the Linux IO scheduler (128) +# is never advantageous and will result in higher latency. +# Do _not_ tune. DataStax recommends contacting the DataStax Services team before changing this value. +# io_global_queue_depth: + + +# Enable memory leaks detection. These parameters should not be used unless directed by a support engineer or +# consultant. See "nodetool help leaksdetection" for the documentation. +#leaks_detection_params: +# sampling_probability: 0.01 +# max_stacks_cache_size_mb: 32 +# num_access_records: 0 +# max_stack_depth: 30 diff --git a/kafka-connector-cdc/connector-config.json b/kafka-connector-cdc/connector-config.json new file mode 100644 index 0000000..19889e5 --- /dev/null +++ b/kafka-connector-cdc/connector-config.json @@ -0,0 +1,10 @@ +{ + "name": "dse-connector-json-multi-table-example", + "config": { + "connector.class": "com.datastax.kafkaconnector.source.DseSourceConnector", + "tasks.max": "10", + "topics": "dse-cdc-demo", + "destination": "demo", + "contactPoints": "dse" + } +} diff --git a/kafka-connector-cdc/docker-compose.yml b/kafka-connector-cdc/docker-compose.yml new file mode 100644 index 0000000..65c6d15 --- /dev/null +++ b/kafka-connector-cdc/docker-compose.yml @@ -0,0 +1,169 @@ +--- +version: '2' +services: + + zookeeper: + image: confluentinc/cp-zookeeper:5.3.1 + hostname: zookeeper + container_name: zookeeper + ports: + - "2181:2181" + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + ZOOKEEPER_TICK_TIME: 2000 + + broker: + image: confluentinc/cp-enterprise-kafka:5.3.1 + hostname: broker + container_name: broker + depends_on: + - zookeeper + ports: + - "29092:29092" + - "9092:9092" + environment: + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 + KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092 + CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181 + CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 + CONFLUENT_METRICS_ENABLE: 
'true' + CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous' + + schema-registry: + image: confluentinc/cp-schema-registry:5.3.1 + hostname: schema-registry + container_name: schema-registry + depends_on: + - zookeeper + - broker + ports: + - "8081:8081" + environment: + SCHEMA_REGISTRY_HOST_NAME: schema-registry + SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' + + connect: + image: confluentinc/cp-kafka-connect-base:5.3.1 + hostname: connect + container_name: connect + depends_on: + - zookeeper + - broker + - schema-registry + ports: + - "8083:8083" + environment: + CONNECT_BOOTSTRAP_SERVERS: 'broker:29092' + CONNECT_REST_ADVERTISED_HOST_NAME: connect + CONNECT_REST_PORT: 8083 + CONNECT_GROUP_ID: compose-connect-group + CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs + CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 + CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets + CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status + CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter + CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter + CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: "false" + CONNECT_INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter + CONNECT_INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter + CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181' + # CLASSPATH required due to CC-2422 + CLASSPATH: /usr/share/java/monitoring-interceptors/monitoring-interceptors-5.3.1.jar + CONNECT_PRODUCER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringProducerInterceptor" + CONNECT_CONSUMER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringConsumerInterceptor" + CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components,/tmp/kafka-connect-dse-2.0.0-20190925-LABS.jar" + CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR + volumes: + - ./kafka-connect-dse-2.0.0-20190925-LABS.jar:/tmp/kafka-connect-dse-2.0.0-20190925-LABS.jar + + control-center: + image: confluentinc/cp-enterprise-control-center:5.3.1 + hostname: control-center + container_name: control-center + depends_on: + - zookeeper + - broker + - schema-registry + - connect + - ksql-server + ports: + - "9021:9021" + environment: + CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092' + CONTROL_CENTER_ZOOKEEPER_CONNECT: 'zookeeper:2181' + CONTROL_CENTER_CONNECT_CLUSTER: 'connect:8083' + CONTROL_CENTER_KSQL_URL: "http://ksql-server:8088" + CONTROL_CENTER_KSQL_ADVERTISED_URL: "http://localhost:8088" + CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081" + CONTROL_CENTER_REPLICATION_FACTOR: 1 + CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1 + CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1 + CONFLUENT_METRICS_TOPIC_REPLICATION: 1 + PORT: 9021 + + ksql-server: + image: confluentinc/cp-ksql-server:5.3.1 + hostname: ksql-server + container_name: ksql-server + depends_on: + - broker + - connect + ports: + - "8088:8088" + environment: + KSQL_CONFIG_DIR: "/etc/ksql" + KSQL_LOG4J_OPTS: "-Dlog4j.configuration=file:/etc/ksql/log4j-rolling.properties" + KSQL_BOOTSTRAP_SERVERS: "broker:29092" + KSQL_HOST_NAME: ksql-server + KSQL_APPLICATION_ID: "cp-all-in-one" + KSQL_LISTENERS: "http://0.0.0.0:8088" + KSQL_CACHE_MAX_BYTES_BUFFERING: 0 + KSQL_KSQL_SCHEMA_REGISTRY_URL: "http://schema-registry:8081" + 
KSQL_PRODUCER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringProducerInterceptor" + KSQL_CONSUMER_INTERCEPTOR_CLASSES: "io.confluent.monitoring.clients.interceptor.MonitoringConsumerInterceptor" + + ksql-cli: + image: confluentinc/cp-ksql-cli:5.3.1 + container_name: ksql-cli + depends_on: + - broker + - connect + - ksql-server + entrypoint: /bin/sh + tty: true + + rest-proxy: + image: confluentinc/cp-kafka-rest:5.3.1 + depends_on: + - zookeeper + - broker + - schema-registry + ports: + - 8082:8082 + hostname: rest-proxy + container_name: rest-proxy + environment: + KAFKA_REST_HOST_NAME: rest-proxy + KAFKA_REST_BOOTSTRAP_SERVERS: 'broker:29092' + KAFKA_REST_LISTENERS: "http://0.0.0.0:8082" + KAFKA_REST_SCHEMA_REGISTRY_URL: 'http://schema-registry:8081' + + dse: + image: datastaxlabs/dse-cdc-server:6.8.0-20190925 + hostname: dse + container_name: dse + environment: + - DS_LICENSE=accept + volumes: + - ./cassandra.yaml:/config/cassandra.yaml + - ./dse.yaml:/config/dse.yaml + diff --git a/kafka-connector-cdc/dse.yaml b/kafka-connector-cdc/dse.yaml new file mode 100644 index 0000000..9ca21e3 --- /dev/null +++ b/kafka-connector-cdc/dse.yaml @@ -0,0 +1,1118 @@ +# Memory limit for DSE In-Memory tables as a fraction of system memory. When not set, +# the default is 0.2 (20% of system memory). +# Specify max_memory_to_lock_fraction or max_memory_to_lock_mb, not both. + +# max_memory_to_lock_fraction: 0.20 + +# Memory limit for DSE In-Memory tables as a maximum in MB. When not set, +# max_memory_to_lock_fraction is used. The max_memory_to_lock_fraction +# value is ignored if max_memory_to_lock_mb is set to a non-zero value. +# Specify max_memory_to_lock_fraction or max_memory_to_lock_mb, not both. + +# max_memory_to_lock_mb: 10240 + +########################## +# Authentication options +# +# These options are used if the authenticator option in cassandra.yaml is set to +# com.datastax.bdp.cassandra.auth.DseAuthenticator +# +# The enabled option controls whether the DseAuthenticator will authenticate users. If +# set to true users will be authenticated, if set to false they will not. +# When not set enabled is false. +# +# DseAuthenticator allows multiple authentication schemes to be used at the same time. +# The schemes to be used are controlled by the default_scheme and other_schemes options. +# A driver can select the scheme to use during authentication. +# +# The default_scheme option selects which authentication scheme will be used if the driver +# does not request a specific scheme. This can be one of the following values: +# internal - plain text authentication using the internal password authenticator +# ldap - plain text authentication using the passthrough LDAP authenticator +# kerberos - GSSAPI authentication using the Kerberos authenticator +# The other_schemes option is a list of schemes that can also be selected for use by a +# driver and can be a list of the above schemes. +# +# The scheme_permissions option controls whether roles need to have permission granted to +# them in order to use specific authentication schemes. These permissions can be granted +# only when the DseAuthorizer is used. +# +# The allow_digest_with_kerberos option controls whether Digest-MD5 authentication is also +# allowed when Kerberos is one of the authentication schemes. If set to false, it will not +# be allowed. You must set allow_digest_with_kerberos to true in analytics clusters to use Hadoop +# inter-node authentication with Hadoop and Spark jobs. 
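+#
+# For example (illustrative only), an analytics datacenter that authenticates with Kerberos and also
+# runs Hadoop and Spark jobs could combine the options above along these lines:
+# authentication_options:
+#     enabled: true
+#     default_scheme: kerberos
+#     allow_digest_with_kerberos: true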
+# +# The plain_text_without_ssl controls how the DseAuthenticator reacts to plain text +# authentication requests over unencrypted client connections. It can be one of: +# block - block the request with an authentication error +# warn - log a warning about the request but allow it to continue +# allow - allow the request without any warning +# +# The transitional_mode option allows the DseAuthenticator to operate in a transitional +# mode during setup of authentication in a cluster. This can be one of the following values: +# disabled - transitional mode is disabled +# permissive - Only super users are authenticated and logged in, all other +# authentication attempts will be logged in as the anonymous user +# normal - If credentials are passed they are authenticated. If the +# authentication is successful then the user is logged in, otherwise +# the user is logged in as anonymous. If no credentials are passed, +# then the user is logged in as anonymous +# strict - If credentials are passed they are authenticated. If the +# authentication is successful, the user is logged in. If the +# authentication fails, an authentication error is returned. If no +# credentials are passed, the user is logged in as anonymous +# authentication_options: +# enabled: false +# default_scheme: internal +# other_schemes: +# scheme_permissions: false +# allow_digest_with_kerberos: true +# plain_text_without_ssl: warn +# transitional_mode: disabled + +########################## +# Role management options +# +# These options are used when the role_manager option in cassandra.yaml is set to +# com.datastax.bdp.cassandra.auth.DseRoleManager +# +# mode can be one of: +# internal - the granting and revoking of roles is managed internally +# using the GRANT ROLE and REVOKE ROLE statements +# ldap - the granting and revoking of roles is managed by an external +# LDAP server configured using the ldap_options. +# role_management_options: +# mode: internal + +########################## +# Authorization options +# +# These options are used if the authorization option in cassandra.yaml is set to +# com.datastax.bdp.cassandra.auth.DseAuthorizer +# +# The enabled option controls whether the DseAuthorizer will perform authorization. If +# set to true authorization is performed, if set to false it is not. +# When not set, enabled is false. +# +# The transitional_mode option allows the DseAuthorizer to operate in a transitional +# mode during setup of authorization in a cluster. This can be one of the following values: +# disabled - transitional mode is disabled, all connections must provide valid credentials and +# map to a login-enabled role +# normal - allow all connections that provide credentials, permissions can be granted to +# resources but are not enforced +# strict - permissions can be granted to resources and are enforced on +# authenticated users. They are not enforced against anonymous +# users +# +# allow_row_level_security - To use row level security, set to true for the entire system. +# Use the same setting on all nodes. +# authorization_options: +# enabled: false +# transitional_mode: disabled +# allow_row_level_security: false + +########################## +# Kerberos options +# +# keytab is /dse.keytab +# The keytab file must contain the credentials for both of the fully resolved principal names, which +# replace _HOST with the fully qualified domain name (FQDN) of the host in the service_principal and +# http_principal settings. The UNIX user running DSE must also have read permissions on the keytab. 
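+#
+# For example (illustrative only; the host name and realm are assumptions), on a node whose FQDN is
+# dse1.example.com in the EXAMPLE.COM realm, the resolved principals stored in the keytab would be
+# dse/dse1.example.com@EXAMPLE.COM and HTTP/dse1.example.com@EXAMPLE.COM, matching:
+# service_principal: dse/_HOST@EXAMPLE.COM
+# http_principal: HTTP/_HOST@EXAMPLE.COM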
+# +# The service_principal is the DataStax Enterprise process runs under must use the form +# /_HOST@ +# +# The http_principal is used by the Tomcat application container to run DSE Search. +# +# The qop is the Quality of Protection (QOP) values that clients and servers +# can use for each connection. Valid values are: +# auth - (default) authentication only +# auth-int - authentication plus integity protection of all transmitted data +# auth-conf - authentication plus integrity protection and encryption of all +# transmitted data +# +# Warning - Encryption using auth-conf is separate and completely independent +# of whether encryption is done using SSL. If auth-conf is selected here +# and SSL is enabled, the transmitted data is encrypted twice. +kerberos_options: + keytab: resources/dse/conf/dse.keytab + service_principal: dse/_HOST@REALM + http_principal: HTTP/_HOST@REALM + qop: auth + +########################## +# LDAP options +# +# These are options are only used when the com.datastax.bdp.cassandra.auth.DseAuthenticator +# is configured as the authenticator in cassandra.yaml and 'ldap' scheme is selected in +# authentication_options and/or role_management_options above. + +# ldap_options: +# The host name of the LDAP server. LDAP on the same host (localhost) is appropriate only in +# single node test or development environments. +# server_host: +# +# # The port on which the LDAP server listens, usually port 389 for unencrypted +# # connections and port 636 for SSL-encrypted connections. If use_tls is set to true, use the +# # unencrypted port +# server_port: 389 +# +# # The distinguished name (DN) of an account that is used to search for other users on the +# # LDAP server. This user should have only the necessary permissions to do the search +# # If not present then an anonymous bind is used for the search +# search_dn: +# +# # Password of the search_dn account +# search_password: +# +# # Set to true to use an SSL encrypted connection. In this case the server_port needs +# # to be set to the LDAP port for the server +# use_ssl: false +# +# # Set to true to initiate a TLS encrypted connection on the default ldap port +# use_tls: false +# +# truststore_path: +# truststore_password: +# truststore_type: jks +# user_search_base: +# user_search_filter: (uid={0}) +# +# # Set to the attribute on the user entry containing group membership information. +# user_memberof_attribute: memberof +# +# # The group_search_type defines how group membership will be determined for a user. It +# # can be one of: +# # directory_search - will do a subtree search of group_search_base using +# # group_search_filter to filter the results +# # memberof_search - will get groups from the memberof attribute of the user. This +# # requires the directory server to have memberof support +# group_search_type: directory_search +# group_search_base: +# group_search_filter: (uniquemember={0}) +# +# # The attribute in the group entry that holds the group name. +# group_name_attribute: cn +# +# # Validity period for the credentials cache in milli-seconds (remote bind is an expensive +# # operation). Defaults to 0, set to 0 to disable. +# credentials_validity_in_ms: 0 +# +# # Validity period for the search cache in seconds. Defaults to 0, set to 0 to disable. +# search_validity_in_seconds: 0 +# +# connection_pool: +# max_active: 8 +# max_idle: 8 + +# To ensure that records with TTLs are purged from DSE Search indexes when they expire, DSE +# periodically checks all indexes for expired documents and deletes them. 
These settings +# control the scheduling and execution of those checks. +ttl_index_rebuild_options: + + # By default, schedule a check every 300 seconds: + fixed_rate_period: 300 + + # The number of seconds to delay the first check to speed up startup time: + initial_delay: 20 + + # All documents determined to be expired are deleted from the index during each check, but + # to avoid memory pressure, their unique keys are retrieved and deletes issued in batches. + # This determines the maximum number of documents per batch: + max_docs_per_batch: 4096 + + # Maximum number of search indexes that can execute TTL cleanup concurrently: + thread_pool_size: 1 + +# DSE Search resource upload size limit in MB. A value of '0' disables resource uploading. +solr_resource_upload_limit_mb: 10 + +# Transport options for inter-node communication between DSE Search nodes. +shard_transport_options: + # The cumulative shard request timeout, in milliseconds, defines the internal timeout for all + # search queries to prevent long running queries. Default is 60000 (1 minute). + netty_client_request_timeout: 60000 + +# ---- DSE Search index encryption options + +# solr_encryption_options: +# # Whether to allocate shared index decryption cache off JVM heap. +# # Default is off heap allocation (true). +# decryption_cache_offheap_allocation: true + +# # The maximum size of shared DSE Search decryption cache, in MB. +# # Default is 256 MB. +# decryption_cache_size_in_mb: 256 + +# ---- DSE Search indexing settings + +# # The maximum number of queued partitions during search index rebuilding. (This serves primarily +# # as a safeguard against excessive heap usage by the indexing queue.) If set lower than the +# # number of TPC threads, not all TPC threads can be actively indexing. +# # +# # Default: 1024 +# back_pressure_threshold_per_core: 1024 +# +# # The max time to wait for flushing of index updates during re-index. +# # Flushing should always complete successfully, in order to fully sync search indexes +# # with DSE data. DataStax recommends to always set at a reasonably high value. +# # +# # Default: 5 minutes +# flush_max_time_per_core: 5 +# +# # The maximum time to wait for each search index to load on startup and create/reload search index operations. +# # Only change this advanced option if any exceptions happen during search index loading. +# # +# # Default: 5 minutes +# load_max_time_per_core: 5 +# +# # Applies the configured Cassandra disk failure policy to index write failures. +# # Default is disabled (false). +# enable_index_disk_failure_policy: false + +# # The directory to store search index data. Each DSE Search index is stored under +# # a solrconfig_data_dir/keyspace.table directory. +# # Default is a solr.data directory inside Cassandra data directory, or as specified +# # by the dse.solr.data.dir system property. +# solr_data_dir: /MyDir + +# # The Lucene field cache has been deprecated. Instead set docValues="true" on the field +# # in the schema.xml file. After changing the schema, reload and reindex the search index. +# # Default: false +# solr_field_cache_enabled: false + +# # Global Lucene RAM buffer usage thresholds (separate for heap and off-heap) at which DSE will force segment flush. +# # Setting this too low may induce a state of constant flushing during periods of ongoing write activity. For +# # NRT, these forced segment flushes will also de-schedule pending auto-soft commits to avoid potentially +# # flushing too many small segments. 
+# # Default: 1024 +# ram_buffer_heap_space_in_mb: 1024 +# # Default: 1024 +# ram_buffer_offheap_space_in_mb: 1024 + +# ---- DSE Search CQL query options + +# # Maximum time in milliseconds to wait for all rows +# # to be read from the database during CQL Solr queries. +# # Default is 10000 (10 seconds). +# cql_solr_query_row_timeout: 10000 + +########################## +# Global performance service options + +# # Number of background threads used by the performance service under normal conditions. +# # Defaults to 4. +# performance_core_threads: 4 +# # Maximum number of background threads used by the performance service. +# # Defaults to concurrent_writes specified in cassandra.yaml. +# performance_max_threads: 32 +# +# # The number of queued tasks in the backlog when the number of performance_max_threads are busy (minimum 0). +# performance_queue_capacity: 32000 +# +# # If the performance service requests more tasks than (performance_max_threads + performance_queue_capacity), +# # a dropped task warning will be issued. This warning indicates that collected statistics may not be up to date +# # because the server couldn't keep up under the current load. +# +# # You can disable some services, reconfigure some services, or increase the queue size. + +########################## +# Performance service options + +graph_events: + ttl_seconds: 600 + +# cql_slow_log_options: +# enabled: true +# +# # When t > 1, log queries taking longer than t milliseconds. +# # 0 <= t <= 1, log queries above t percentile +# threshold: 200.0 +# +# # Initial number of queries before percentile filter becomes active +# minimum_samples: 100 +# +# ttl_seconds: 259200 +# +# # Keeps slow queries in-memory only and doesn't write data to the database. +# # WARNING - if this is set to 'false' then set threshold >= 2000, otherwise there will be a +# # high load on the database. 
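+#
+#   # For example (illustrative only), persisting slow queries to the database with the safer
+#   # threshold mentioned in the warning above could look like:
+#   # skip_writing_to_db: false
+#   # threshold: 2000
+#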
+# skip_writing_to_db: true +# +# # The number of slow queries to keep in-memory +# num_slowest_queries: 5 + +cql_system_info_options: + enabled: false + refresh_rate_ms: 10000 + +resource_level_latency_tracking_options: + enabled: false + refresh_rate_ms: 10000 + +db_summary_stats_options: + enabled: false + refresh_rate_ms: 10000 + +cluster_summary_stats_options: + enabled: false + refresh_rate_ms: 10000 + +spark_cluster_info_options: + enabled: false + refresh_rate_ms: 10000 + +# ---- Spark application stats options +spark_application_info_options: + enabled: false + refresh_rate_ms: 10000 + + driver: + # enables or disables writing of the metrics collected at Spark Driver to Cassandra + sink: false + + # enables or disables Spark Cassandra Connector metrics at Spark Driver + connectorSource: false + + # enables or disables JVM heap and GC metrics at Spark Driver + jvmSource: false + + # enables or disables application state metrics + stateSource: false + + executor: + # enables or disables writing of the metrics collected at executors to Cassandra + sink: false + + # enables or disables Spark Cassandra Connector metrics at executors + connectorSource: false + + # enables or disables JVM heap and GC metrics at executors + jvmSource: false + +# Table Histogram data tables options +histogram_data_options: + enabled: false + refresh_rate_ms: 10000 + retention_count: 3 + +# User/Resource latency tracking settings +user_level_latency_tracking_options: + enabled: false + refresh_rate_ms: 10000 + top_stats_limit: 100 + quantiles: false + +# ---- DSE Search Performance Objects + +solr_slow_sub_query_log_options: + enabled: false + ttl_seconds: 604800 + async_writers: 1 + threshold_ms: 3000 + +solr_update_handler_metrics_options: + enabled: false + ttl_seconds: 604800 + refresh_rate_ms: 60000 + +solr_request_handler_metrics_options: + enabled: false + ttl_seconds: 604800 + refresh_rate_ms: 60000 + +solr_index_stats_options: + enabled: false + ttl_seconds: 604800 + refresh_rate_ms: 60000 + +solr_cache_stats_options: + enabled: false + ttl_seconds: 604800 + refresh_rate_ms: 60000 + +solr_latency_snapshot_options: + enabled: false + ttl_seconds: 604800 + refresh_rate_ms: 60000 + +# Node health is a score-based representation of how fit a node is to handle queries. The score is a +# function of how long a node has been up and the rate of dropped mutations in the recent past. +node_health_options: + refresh_rate_ms: 60000 + # The amount of continuous uptime required for the node to reach the maximum uptime score. If you + # are concerned with consistency during repair after a period of downtime, you may want to + # temporarily increase this time to the expected time it will take to complete repair. + # + # Default - 10800 seconds (3 hours) + uptime_ramp_up_period_seconds: 10800 + # The time window in the past over which the rate of dropped mutations affects the node health score. + # Default - 30 minutes + dropped_mutation_window_minutes: 30 + +# If enabled (true), replica selection for distributed DSE Search queries takes node health into account +# when multiple candidates exist for a particular token range. Set to false to ignore +# node health when choosing replicas. +# +# Health-based routing allows us to make a trade-off between index consistency and query throughput. If +# the primary concern is query performance, it may make sense to set this to "false". +# +# Default is enabled (true). 
+enable_health_based_routing: true + +# If enabled (true), DSE Search reindexing of bootstrapped data will happen asynchronously, and the node will join the ring straight +# after bootstrap. +# +# Default is disabled (false). The node will wait for reindexing of bootstrapped data to finish before joining the ring. +async_bootstrap_reindex: false + +# Lease metrics. Enable these metrics to help monitor the performance of the lease subsystem. +# ttl_seconds controls how long the log of lease holder changes persists. +lease_metrics_options: + enabled: false + ttl_seconds: 604800 + +# The directory where system keys are kept. +# +# Keys used for SSTable encryption must be distributed to all nodes. +# DSE must be able to read and write to the directory. +# +# This directory should have 700 permissions and belong to the dse user. +system_key_directory: /etc/dse/conf + +# If this is set to true, DSE requires the following config values to be encrypted: +# resources/cassandra/conf/cassandra.yaml: +# server_encryption_options.keystore_password +# server_encryption_options.truststore_password +# client_encryption_options.keystore_password +# client_encryption_options.truststore_password +# resources/dse/conf/dse.yaml: +# ldap_options.search_password +# ldap_options.truststore_password +# +# It's an error if the passwords aren't encrypted. +# Config values can be encrypted with "dsetool encryptconfigvalue" +config_encryption_active: false + +# The name of the system key used to encrypt / decrypt passwords stored +# in configuration files. +# +# If config_encryption_active is true, it's an error if a valid key with +# this name isn't in the system key directory keyfiles, and KMIP managed +# keys can be created with "dsetool createsystemkey" +config_encryption_key_name: system_key + +########################## +# Spark-related settings + +# The length of a shared secret used to authenticate Spark components and encrypt the connections between them. +# Note that this is not the strength of the cipher used for encrypting connections. +spark_shared_secret_bit_length: 256 + +# Enables Spark security based on shared secret infrastructure. Enables mutual authentication between Spark master +# and worker nodes. If DSE authentication is enabled, spark security is forced to be enabled and this parameter is ignored. +spark_security_enabled: false + +# Enables encryption between Spark master and worker nodes, except Web UI. The connection uses the +# Digest-MD5 SASL-based encryption mechanism. This option applies only if spark_security_enabled is true. +# If DSE authentication is enabled, spark security encryption is forced to be enabled and this parameter is ignored. +spark_security_encryption_enabled: false + +# # How often Spark plugin should check for Spark Master / Spark Worker readiness to start. The value is +# # a time (in ms) between subsequent retries. +# spark_daemon_readiness_assertion_interval: 1000 + +# +# Legacy Resource Manager options +# +# Controls the physical resources that can be used by Spark applications on this node. +# cores_total is the number of cores and and memory_total is total system memory that you can assign to all executors +# that are run by the work pools on this node. The values can be absolute (exact number of cores) or the +# memory size (use metric suffixes like M for mega, and G for giga) or a fraction of physical cores reported by the OS, +# and fraction of available memory, where available memory is calculated as: total physical memory - DSE max heap size. 
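+#
+# For example (illustrative only; the host size and heap are assumptions): on a 64 GB machine where DSE
+# runs with a 24 GB max heap, available memory is 64 - 24 = 40 GB, so memory_total: 0.6 would let the
+# work pools use roughly 0.6 * 40 = 24 GB, and memory_total: 24G would request the same amount as an
+# absolute value.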
+# cores_total and memory_total replace initial_spark_worker_resources option which was used in earlier DSE versions. +# The default 0.7 for cores and memory corresponds to the default value of initial_spark_worker_resources 0.7. +# DSE does not support setting Spark Worker cores and memory through environment variables SPARK_WORKER_CORES +# and SPARK_WORKER_MEMORY. +# resource_manager_options: +# worker_options: +# cores_total: 0.7 +# memory_total: 0.6 +# +# workpools: +# - name: alwayson_sql +# cores: 0.25 +# memory: 0.25 + +# In DSE 5.1 and later: Communication between Spark applications and the resource manager are routed through +# the CQL native protocol. Enabling client encryption in cassandra.yaml will also enable encryption for +# the communication with the DSE Spark Master. To secure the communication between Spark Driver and Spark Executors, +# enable Spark authentication and encryption for that application. +# In contrast, mutual authentication and encryption of communication between DSE Spark Master and Workers are +# managed by spark_security_enabled and spark_security_encryption_enabled in dse.yaml. + +# Spark UI options apply to Spark Master and Spark Worker UIs and to Spark daemon UIs in general. Spark UI options do NOT +# apply to user applications even if they run in cluster mode. +spark_ui_options: + # Valid values are: + # inherit - SSL settings are inherited from DSE client encryption options + # custom - SSL settings from encryption_options below + encryption: inherit + + encryption_options: + enabled: false + keystore: resources/dse/conf/.ui-keystore + keystore_password: cassandra + # require_client_auth: false + # Set trustore and truststore_password if require_client_auth is true + # truststore: resources/dse/conf/.ui-truststore + # truststore_password: cassandra + # More advanced defaults: + # protocol: TLS + # algorithm: SunX509 + # + # replaces the deprecated store_type for keystore, valid types can be JKS, JCEKS, PKCS12 or PKCS11 + # for file based keystores prefer PKCS12 + # keystore_type: JKS + # + # replaces the deprecated store_type for truststore, valid types can be JKS, JCEKS, PKCS12 or PKCS11 + # for file based keystores prefer PKCS12 + # truststore_type: JKS + # + # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA] + +# Configure how the driver and executor processes are created and managed. +spark_process_runner: + # Valid options are: default, run_as + runner_type: default + + # DSE uses sudo to run Spark application components (drivers and executors) as specific OS users. + # A set of predefined users, called slot users, is used for this purpose. All drivers and executors + # owned by some DSE user are run as some slot user x. Drivers and executors of any other DSE user + # use different slots. + # Setting up slots: + # 1. Create n users (n = number of slots), call them slot1, slot2, ..., slotn, with no login. Each user + # should have primary group the same as its name, so for example slot1:slot1, slot2:slot2, ... + # 2. Add DSE service user (the user who runs DSE server) to the slot user groups; the DSE service user must be + # in all slot user groups. + # 3. 
Modify the sudoers files so that: + # a) DSE service user can execute any command as any slot user without providing a password + # b) umask is overridden to 007 for those commands so that files created by sub-processes will not be accessible + # by anyone by default, + # For example, if we have two slot users slot1, slot2, and DSE service user dse, add these slot users to sudoers: + # Runas_Alias SLOTS = slot1, slot2 + # Defaults>SLOTS umask=007 + # Defaults>SLOTS umask_override + # dse ALL=(SLOTS) NOPASSWD: ALL + run_as_runner_options: + user_slots: + - slot1 + - slot2 + +# AlwaysOn SQL options have dependence on workpool setting of resource_manager_options. Set workpool configuration if you +# enable alwayson_sql_options. +# alwayson_sql_options: +# # Set to true to enable the node for AlwaysOn SQL. Only an Analytics node +# # can be enabled as an AlwaysOn SQL node. +# enabled: false +# +# # AlwaysOn SQL Thrift port +# thrift_port: 10000 +# +# # AlwaysOn SQL WebUI port +# web_ui_port: 9077 +# +# # The waiting time to reserve the Thrift port if it's not available +# reserve_port_wait_time_ms: 100 +# +# # The waiting time to check AlwaysOn SQL health status +# alwayson_sql_status_check_wait_time_ms: 500 +# +# # The work pool name used by AlwaysOn SQL +# workpool: alwayson_sql +# +# # Location in DSEFS of the log files +# log_dsefs_dir: /spark/log/alwayson_sql +# +# # The role to use for internal communication by AlwaysOn SQL if authentication is enabled +# auth_user: alwayson_sql +# +# # The maximum number of errors that can occur during AlwaysOn SQL service runner thread +# # runs before stopping the service. A service stop requires a manual restart. +# runner_max_errors: 10 +# +# # The interval in seconds to update heartbeat of AlwaysOn SQL. If heartbeat is not updated +# # for more than the period of three times of the interval, AlwaysOn SQL malfunctions. +# # AlwaysOn SQL automatically restarts. +# heartbeat_update_interval_seconds: 30 + +########################## +# DSE File System (DSEFS) options +# dsefs_options: +# +# # Whether to enable DSEFS on this node. +# # If not set, DSEFS is enabled only on the nodes that run a Spark workload. +# enabled: true +# +# # The keyspace where the DSEFS metadata is stored. Optionally configure multiple DSEFS file systems +# # within a cluster by specifying a different keyspace name for each datacenter. +# keyspace_name: dsefs +# +# # The local directory for storing the local node metadata, including the node identifier. +# # The amount of data stored is nominal, and does not require configuration for throughput, latency, or capacity. +# # This directory must not be shared by DSEFS nodes. +# work_dir: /var/lib/dsefs +# +# # The public port on which DSEFS listens for clients. The service on this port is bound to +# # native_transport address. +# public_port: 5598 +# +# # Port for inter-node communication, must be not visible from outside of the cluster. +# # It is bound to listen address. Do not open this port to firewalls. +# private_port: 5599 +# +# # Mandatory attribute to identify the set of directories. DataStax recommends segregating these data directories +# # on physical devices that are different from the devices that are used for the DSE database. +# # Using multiple directories on JBOD improves performance and capacity. +# data_directories: +# - dir: /var/lib/dsefs/data +# +# # The weighting factor for this location specifies how much data to place in this directory, relative to +# # other directories in the cluster. 
This soft constraint determines how DSEFS distributes the data. +# storage_weight: 1.0 +# +# # Reserved space (in bytes) that is not going to be used for storing blocks +# min_free_space: 268435456 +# +# # More advanced settings: +# +# # Wait time before the DSEFS server times out while waiting for services to bootstrap. +# service_startup_timeout_ms: 600000 +# +# # Wait time before the DSEFS server times out while waiting for services to close. +# service_close_timeout_ms: 600000 +# +# # Wait time that the DSEFS server waits during shutdown before closing all pending connections. +# server_close_timeout_ms: 2147483647 # Integer.MAX_VALUE +# +# # The maximum accepted size of a compression frame defined during file upload. +# compression_frame_max_size: 1048576 +# +# # Maximum number of elements in a single DSEFS Server query cache. DSEFS reuses this value for every cache that +# # stores database query results. +# query_cache_size: 2048 +# +# # The time to retain the DSEFS Server query cache element in cache. The cache element expires +# # when this time is exceeded. +# query_cache_expire_after_ms: 2000 +# +# internode_authentication: +# # If enabled, the servers are obliged to authenticate all messages passed between them on private_port. +# # The authentication protocol is based on HMAC used with a pre-shared secret available only to DSE cluster +# # members (nodes). +# # The actual key is never passed between the nodes. +# # Typically there is no need to turn this authentication off and it doesn't incur any performance overhead. +# # Disabling internode authentication is not recommended, but may be used for debugging purposes +# # to issue internode requests manually with curl. +# # Limitations: +# # Beware that enabling internode authentication does not encrypt the internode traffic. +# # Only HTTP headers are protected with HMAC, so MITM attacks are still possible on the message data. +# # It is also possible to bypass the authentication if the DSE messaging subsystem was not +# # properly secured and the attacker could fake being a part of the DSE cluster in order to obtain +# # the secret key. If you need stronger security, please configure SSL. +# enabled: true +# +# # Algorithm used for key encryption: +# algorithm: HmacSHA256 +# +# gossip_options: +# # The delay between gossip rounds +# round_delay_ms: 2000 +# +# # How long to wait after registering the Location and reading back all other Locations from the database +# startup_delay_ms: 5000 +# +# # How long to wait after announcing shutdown before shutting down the node +# shutdown_delay_ms: 10000 +# +# rest_options: +# # How long RestClient is going to wait for a response corresponding to a given request +# request_timeout_ms: 330000 +# +# # How long RestClient is going to wait for establishing a new connection +# connection_open_timeout_ms: 10000 +# +# # How long RestClient is going to wait until all pending transfers are complete before closing +# client_close_timeout_ms: 60000 +# +# # How long to wait for the server rest call to complete +# server_request_timeout_ms: 300000 +# +# # Wait time, in milliseconds, before closing idle RestClient - server connection. 0 if disabled. +# # If RestClient does not close connection after this timeout, the server closes the connection after +# # 2 * idle_connection_timeout_ms milliseconds. +# idle_connection_timeout_ms: 60000 +# +# # Wait time, in milliseconds, before closing idle internode connection. The internode connections are +# # mainly used to exchange data during replication. 
Do not set lower than the default value for heavily +# # utilized DSEFS clusters. +# internode_idle_connection_timeout_ms: 120000 +# +# # Maximum number of connections to a given host per single CPU core. DSEFS keeps a connection pool for +# # each CPU core. +# core_max_concurrent_connections_per_host: 8 +# +# transaction_options: +# # How long to allow a transaction to run before considering it for timing out and rollback +# transaction_timeout_ms: 60000 +# +# # How long to wait before retrying a transaction aborted due to a conflict +# conflict_retry_delay_ms: 10 +# +# # How many times the transaction is retried in case of a conflict before giving up +# conflict_retry_count: 40 +# +# # How long to wait before retrying a failed transaction payload execution +# execution_retry_delay_ms: 1000 +# +# # How many times to retry executing the payload before signaling the error to the application +# execution_retry_count: 3 +# +# block_allocator_options: +# # The overflow_margin_mb and overflow_factor options control how much additional data can be placed +# # on the local (coordinator) before the local node overflows to the other nodes. +# # A local node is preferred for a new block allocation, if +# # used_size_on_the_local_node < average_used_size_per_node * overflow_factor + overflow_margin. +# # The trade-off is between data locality of writes and balancing the cluster. +# # To disable the preference for allocating blocks on the coordinator node, set these values to 0 MB and 1.0. +# overflow_margin_mb: 1024 +# overflow_factor: 1.05 + +# Insightful Monitoring(Insights) Options +# enable insights_options. +# insights_options: +# # Directory to store insights +# data_dir: /var/lib/cassandra/insights_data +# +# # Directory to store insight logs +# log_dir: /var/log/cassandra/ + +########################## +# Audit logging options +audit_logging_options: + enabled: false + + # The logger used for logging audit information + # Available loggers are: + # CassandraAuditWriter - logs audit info to a cassandra table. This logger can be run synchronously or + # asynchronously. Audit logs are stored in the dse_audit.audit_log table. + # When run synchronously, a query will not execute until it has been written + # to the audit log table successfully. If a failure occurs before an audit event is + # written, and it's query is executed, the audit logs might contain queries that were never + # executed. + # SLF4JAuditWriter - logs audit info to an SLF4J logger. The logger name is `SLF4JAuditWriter`, + # and can be configured in the logback.xml file. + logger: SLF4JAuditWriter + +# # Comma-separated list of audit event categories to be included or excluded from the audit log. +# # When not set, the default includes all categories. +# # Categories are: QUERY, DML, DDL, DCL, AUTH, ADMIN, ERROR. +# # Specify either included or excluded categories. Specifying both is an error. +# included_categories: +# excluded_categories: + +# # Comma-separated list of keyspaces to be included or excluded from the audit log. +# # When not set, the default includes all keyspaces. +# # Specify either included or excluded keyspaces. Specifying both is an error. +# included_keyspaces: +# excluded_keyspaces: + +# # Comma separated list of the roles to be audited or not. +# # Specify either included or excluded roles. Specifying both is an error +# included_roles: +# excluded_roles: + + # The amount of time, in hours, audit events are retained by supporting loggers. + # Only the CassandraAuditWriter supports retention time. 
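+  # For example (illustrative only), retention_time: 720 would keep roughly 30 days (720 hours) of
+  # audit events in the dse_audit.audit_log table.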
+ # Values of 0 or less retain events forever.
+ retention_time: 0
+
+ cassandra_audit_writer_options:
+ # Sets the mode the audit writer runs in.
+ #
+ # When run synchronously, a query is not executed until the audit event is successfully written.
+ #
+ # When run asynchronously, audit events are queued for writing to the audit table, but are
+ # not necessarily logged before the query executes. A pool of writer threads consumes the
+ # audit events from the queue, and writes them to the audit table in batch queries. While
+ # this substantially improves performance under load, if there is a failure between when
+ # a query is executed, and its audit event is written to the table, the audit table may
+ # be missing entries for queries that were executed.
+ # Valid options are 'sync' and 'async'.
+ mode: sync
+
+ # The maximum number of events the writer will dequeue before writing them out to the table.
+ # If you're seeing warnings in your logs about batches being too large, decrease this value.
+ # Increasing guardrails.batch_size_warn_threshold_in_kb in cassandra.yaml is also an option, but make sure you understand
+ # the implications before doing so.
+ #
+ # Only used in async mode. Must be >0
+ batch_size: 50
+
+ # The maximum amount of time, in milliseconds, an event waits before a writer dequeues it and writes it out. This
+ # prevents events from waiting too long before being written to the table when relatively few queries are being executed.
+ #
+ # Only used in async mode. Must be >0
+ flush_time: 250
+
+ # The size of the queue feeding the asynchronous audit log writer threads. When there are more events being
+ # produced than the writers can write out, the queue will fill up, and newer queries will block until there
+ # is space on the queue.
+ # If a value of 0 is used, the queue size will be unbounded, which can lead to resource exhaustion under
+ # heavy query load.
+ queue_size: 30000
+
+ # The consistency level used to write audit events.
+ write_consistency: QUORUM
+
+# # Where dropped events are logged
+# dropped_event_log: /var/log/cassandra/dropped_audit_events.log
+
+# # Partition days into hours by default
+# day_partition_millis: 3600000
+
+##########################
+# System information encryption settings
+#
+# If enabled, system tables that might contain sensitive information (system.batchlog,
+# system.paxos), hints files, and Cassandra commit logs are encrypted with these
+# encryption settings.
+#
+# If DSE Search index encryption is enabled, DSE Search index files are also encrypted with these settings.
+# If backing C* table encryption is enabled, DSE Search commit log is encrypted with these settings.
+#
+# When enabling system table encryption on a node with existing data, run
+# `nodetool upgradesstables -a` on the listed tables to encrypt existing data.
+#
+# When tracing is enabled, sensitive information is written to the tables in the
+# system_traces keyspace. Configure encryption on the tables to encrypt their data
+# on disk by using an encrypting compressor.
+#
+# DataStax recommends using remote encryption keys from a KMIP server when using Transparent Data Encryption (TDE) features.
+# Local key support is provided when a KMIP server is not available.
+system_info_encryption:
+ enabled: false
+ cipher_algorithm: AES
+ secret_key_strength: 128
+ chunk_length_kb: 64
+
+# # The encryptor will use a KMIP key server to manage its encryption keys. Specify this only to use a KMIP
+# # key server; otherwise, omit this entry.
The default is to use local key encryption.
+# key_provider: KmipKeyProviderFactory
+
+# # If KmipKeyProviderFactory is used for system_info_encryption, this specifies the kmip host to be used.
+# kmip_host: kmip_host_name
+
+##########################
+# KMIP hosts options
+#
+# Connection settings for key servers supporting the KMIP protocol
+# allow DSE encryption features to use encryption and decryption keys that are not stored
+# on the same machine running DSE.
+#
+# Hosts are configured as <user-defined name>: {connection_settings}, which maps a user-defined
+# name to a set of KMIP hosts and KMIP-defined credentials (keystores and truststores) that are used with a particular
+# key server. This name is then used when referring to KMIP hosts. DSE supports multiple KMIP hosts.
+
+# kmip_hosts:
+# # The unique name of this KMIP host/cluster which is specified in the table schema.
+# host.yourdomain.com:
+#
+# # Comma-separated list of KMIP hosts host[:port]
+# # The current implementation of KMIP connection management supports only failover, so all requests will
+# # go through a single KMIP server. There is no load balancing. This is because there aren't many known KMIP servers
+# # that support read replication, or other strategies for availability.
+# #
+# # Hosts are tried in the order they appear, so add KMIP hosts in the intended failover sequence.
+# hosts: kmip1.yourdomain.com, kmip2.yourdomain.com
+#
+# # keystore/truststore info
+# keystore_path: /path/to/keystore.jks
+# keystore_type: jks
+# keystore_password: password
+#
+# truststore_path: /path/to/truststore.jks
+# truststore_type: jks
+# truststore_password: password
+#
+# # The time that keys read from the KMIP hosts are cached locally.
+# # The longer keys are cached, the fewer requests are made to the key server. However, this also sets the time
+# # for changes (i.e., revocation) to propagate to the DSE node.
+# key_cache_millis: 300000
+#
+# # Socket timeout in milliseconds.
+# timeout: 1000
+
+# # driver - DSE Search will use Solr cursor paging (deep paging) when pagination is enabled by the CQL driver.
+# #
+# # off - DSE Search will ignore the driver's pagination settings and use normal Solr paging unless:
+# # - The current workload is an analytics workload (ex. SearchAnalytics).
+# # - The query parameter 'paging' is set to 'driver'.
+# #
+# # Default is 'off'
+# #
+# cql_solr_query_paging: off
+
+# Local settings for tiered storage
+#
+# Tiered storage supports multiple disk configurations, which are configured as <name>: <config> and specified in the DDL.
+# The tiers themselves are unnamed, and are just collections of paths that must be defined in the order they're to be used.
+# Typically, put your fastest storage in the top tier, and go down from there.
+#
+# Storage configurations don't need to be homogeneous across the cluster, and internally, each node will use only
+# the number of tiers it has configured, or the number of tiers configured to be used in the DDL, whichever is less.
+#
+# Although the behavior of the tiered storage strategy for a given table is configured in the DDL, these settings can
+# be overridden locally, per node, by specifying 'local_options' : {<key>: <value>, ...} in a table schema. This can be useful for testing
+# options before deploying cluster wide, or for storage configurations which don't map cleanly to the DDL configuration.
+# +# tiered_storage_options: +# strategy1: +# tiers: +# - paths: +# - /mnt1 +# - /mnt2 +# - paths: [ /mnt3, /mnt4 ] +# - paths: [ /mnt5, /mnt6 ] +# +# local_options: +# k1: v1 +# k2: v2 +# +# 'another strategy': +# tiers: [ paths: [ /mnt1 ] ] + +########################## +# DSE Advanced Replication configuration settings +# +# DSE Advanced replication supports one-way distributed data replication from remote +# clusters (source clusters) to central data hubs (destination clusters). +# +advanced_replication_options: + enabled: true +# # Whether to enable driver password encryption. Driver passwords are stored in a CQL table. +# # DataStax recommends encrypting the driver passwords before you add them to the CQL table. +# # By default, driver user names and passwords are plain text. When true, the configured passwords +# # (including Cassandra password, SSL keystore/truststore password, etc.) that are stored in the +# # advrep config must be encrypted and generated as system keys. Each node in the source cluster must have the same +# # encryption/decryption key. The destination cluster does not require this key. + +# conf_driver_password_encryption_enabled: false + +# # The directory to hold advanced replication log files. + advanced_replication_directory: /var/lib/cassandra/advrep + +# # The base path that will be prepended to paths in the Advanced Replication +# # configuration locations, including locations to SSL keystore, SSL truststore, etc. +# security_base_path: /base/path/to/advrep/security/files/ + +########################## +# These internode_messaging_options configure network services for internal communication +# for all nodes. These settings must be identical on all nodes in the cluster. +internode_messaging_options: + # TCP listen port (mandatory) + port: 8609 + +# # Maximum message frame length. If not set, the default is 256 MB. +# frame_length_in_mb: 256 + +# # Number of server acceptor threads. If not set, the default is the number of available processors. +# server_acceptor_threads: 8 + +# # Number of server worker threads. If not set, the default is the number of available processors * 8. +# server_worker_threads: 16 + +# # Maximum number of client connections. If not set, the default is 100. +# client_max_connections: 100 + +# # Number of client worker threads. If not set, the default is the number of available processors * 8. +# client_worker_threads: 16 + +# # Timeout for communication handshake process. If not set, the default is 10 seconds. +# handshake_timeout_seconds: 10 + +# # Client request timeout. If not set, the default is 60 seconds. +# client_request_timeout_seconds: 60 + +########################## +# Graph configuration +# Contains all system-level configuration options and those shared between graph +# instances. +# graph: + # Maximum time to wait for an OLAP analytic (Spark) traversal to evaluate. + # When not set, the default is 10080 minutes (168 hours). + # analytic_evaluation_timeout_in_minutes: 10080 + + # Maximum time to wait for an OLTP real-time traversal to evaluate. + # When not set, the default is 30 seconds. + # realtime_evaluation_timeout_in_seconds: 30 + + # Maximum time to wait for the database to agree on schema versions before timing + # out. When not set, the default is 10000 ms (10 seconds). + # schema_agreement_timeout_in_ms: 10000 + + # Maximum time to wait for a graph-system request to evaluate. Creating a new + # graph is an example of a graph-system request. + # When not set, the default is 180 seconds. 
# system_evaluation_timeout_in_seconds: 180
+
+ # The amount of memory (RAM) to allocate to each graph's adjacency (edge and property)
+ # cache. When not set, the default is 128. Value: integer.
+ # adjacency_cache_size_in_mb: 128
+
+ # The amount of memory (RAM) to allocate to the index cache.
+ # When not set, the default is 128. Value: integer.
+ # index_cache_size_in_mb: 128
+
+ # The maximum number of parameters that can be passed on a graph query request for both TinkerPop drivers
+ # and those using the Cassandra native protocol. Generally speaking, it is considered an anti-pattern to
+ # pass "massive" numbers of parameters on requests, as it increases the script evaluation time. Consider
+ # other methods for parameterizing scripts (like passing a single Map or List if many arguments are needed)
+ # before you increase this value.
+ # max_query_params: 16
+
+# gremlin_server:
+ # port: 8182
+
+ # Size of the worker thread pool. Should generally not exceed 2 * number of cores.
+ # A worker thread performs non-blocking read and write for one or more Channels.
+ # threadPoolWorker: 2
+
+ # The number of "Gremlin" threads available to execute scripts in a ScriptEngine as well as bytecode requests.
+ # This pool represents the workers available to handle blocking operations in Gremlin Server. When unset or set to zero,
+ # this value will be defaulted to 10 times the value of the JVM property "cassandra.available_processors" (if set)
+ # or to 10 times the value of Runtime.getRuntime().availableProcessors() (otherwise).
+ # gremlinPool: 0
+
+# # The gremlin-groovy script engine will always be added even if the configuration option is not present.
+# # Additional imports may be added in the configuration for that script engine.
+# scriptEngines:
+# gremlin-groovy:
+# config:
+# # To disable the gremlin groovy sandbox entirely
+# sandbox_enabled: false
+# sandbox_rules:
+#
+# # To completely whitelist a package, add the package name here
+# whitelist_packages:
+# - package.name
+#
+# # To whitelist an individual type, add the name of the type here
+# whitelist_types:
+# - fully.qualified.class.name
+#
+# # To whitelist a superclass, add the name of the type here
+# whitelist_supers:
+# - fully.qualified.class.name
+
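+##########################
+# Illustrative example (not part of the shipped defaults): the audit logging
+# section earlier in this file describes the CassandraAuditWriter and its
+# sync/async modes, but the configuration above leaves auditing disabled and
+# pointed at the SLF4JAuditWriter. The commented sketch below shows roughly
+# what an enabled, asynchronous CassandraAuditWriter setup could look like;
+# the specific values are assumptions chosen only for illustration.
+#
+# audit_logging_options:
+#   enabled: true
+#   logger: CassandraAuditWriter
+#   # Audit only data-changing, schema-changing, and authentication events.
+#   included_categories: DML, DDL, AUTH
+#   # Keep audit rows for one week; retention is honored only by CassandraAuditWriter.
+#   retention_time: 168
+#   cassandra_audit_writer_options:
+#     # async trades strict write-before-execute ordering for throughput.
+#     mode: async
+#     batch_size: 50
+#     flush_time: 250
+#     queue_size: 30000
+#     write_consistency: QUORUM
+##########################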