Skip to content

Commit

Permalink
better job profile collection defaults (#228)
Browse files Browse the repository at this point in the history
* when doing light and health check and the pat is added will collect 20
  profiles
* moved rest collection except job profiles to front of collection to
  avoid bearer token expiration at the end of the job
  • Loading branch information
rsvihladremio authored Mar 28, 2024
1 parent 5fa7559 commit f2e6d33
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 28 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Changelog

## [2.3.0] - 2024-03-28

### Changed

* the default number of job profiles is now 20 for everything except the health check, where it remains 25000. This default only works if
someone has added the PAT which is always available
* moved all REST API calls to the start of the process. This is to minimize the number of operations that fail if a token expires.

## [2.3.0-rc3] - 2024-03-26

### Fixed
Expand Down Expand Up @@ -599,6 +607,7 @@

- able to capture logs, configuration and diagnostic data from dremio clusters deployed on Kubernetes and on-prem

[2.3.0]: https://github.com/dremio/dremio-diagnostic-collector/compare/v2.3.0-rc3...v2.3.0
[2.3.0-rc3]: https://github.com/dremio/dremio-diagnostic-collector/compare/v2.3.0-rc2...v2.3.0-rc3
[2.3.0-rc2]: https://github.com/dremio/dremio-diagnostic-collector/compare/v2.3.0-rc1...v2.3.0-rc2
[2.3.0-rc1]: https://github.com/dremio/dremio-diagnostic-collector/compare/v2.3.0-beta3...v2.3.0-rc1
Expand Down
6 changes: 5 additions & 1 deletion cmd/local/conf/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ func SetViperDefaults(confData map[string]interface{}, hostName string, defaultC
setDefault(confData, KeyDremioQueriesJSONNumDays, 30)
setDefault(confData, KeyNumberThreads, 2)
}
if collectionMode == collects.HealthCheckCollection {
setDefault(confData, KeyNumberJobProfiles, 25000)
} else {
setDefault(confData, KeyNumberJobProfiles, 20)
}

// set default config
setDefault(confData, KeyCollectJStack, false)
Expand All @@ -59,7 +64,6 @@ func SetViperDefaults(confData map[string]interface{}, hostName string, defaultC
setDefault(confData, KeyDremioRocksdbDir, "/opt/dremio/data/db")
setDefault(confData, KeyCollectDremioConfiguration, true)
setDefault(confData, KeyCaptureHeapDump, false)
setDefault(confData, KeyNumberJobProfiles, 25000)
setDefault(confData, KeyDremioEndpoint, "http://localhost:9047")
setDefault(confData, KeyTarballOutDir, "/tmp/ddc")
setDefault(confData, KeyCollectOSConfig, true)
Expand Down
126 changes: 122 additions & 4 deletions cmd/local/conf/defaults_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,18 @@ import (
"github.com/dremio/dremio-diagnostic-collector/pkg/collects"
)

func setupTestSetViperDefaults() (map[string]interface{}, string, int) {
func setupTestSetViperDefaults(collectionType string) (map[string]interface{}, string, int) {
hostName := "test-host"
defaultCaptureSeconds := 30
confData := make(map[string]interface{})
// Run the function.
conf.SetViperDefaults(confData, hostName, defaultCaptureSeconds, collects.StandardCollection)
conf.SetViperDefaults(confData, hostName, defaultCaptureSeconds, collectionType)

return confData, hostName, defaultCaptureSeconds
}

func TestSetViperDefaults(t *testing.T) {
confData, hostName, defaultCaptureSeconds := setupTestSetViperDefaults()
func TestSetViperDefaultsWithHealthCheck(t *testing.T) {
confData, hostName, defaultCaptureSeconds := setupTestSetViperDefaults(collects.HealthCheckCollection)

checks := []struct {
key string
Expand Down Expand Up @@ -92,3 +92,121 @@ func TestSetViperDefaults(t *testing.T) {
}
}
}

// TestSetViperDefaultsQuickCollect checks every default that SetViperDefaults
// writes into the configuration map when the collection mode is
// collects.QuickCollection. Each table entry pairs a configuration key with the
// value the map is expected to hold for it after the defaults are applied.
func TestSetViperDefaultsQuickCollect(t *testing.T) {
	// defaultCase is one key/value expectation against the populated map.
	type defaultCase struct {
		key  string
		want interface{}
	}

	confData, hostName, defaultCaptureSeconds := setupTestSetViperDefaults(collects.QuickCollection)

	cases := []defaultCase{
		{key: conf.KeyDisableRESTAPI, want: false},
		{key: conf.KeyCollectAccelerationLog, want: false},
		{key: conf.KeyCollectAccessLog, want: false},
		{key: conf.KeyCollectAuditLog, want: false},
		{key: conf.KeyCollectJVMFlags, want: true},
		{key: conf.KeyDremioLogDir, want: "/var/log/dremio"},
		{key: conf.KeyNumberThreads, want: 1},
		{key: conf.KeyDremioPid, want: 0},
		{key: conf.KeyDremioPidDetection, want: true},
		{key: conf.KeyDremioUsername, want: "dremio"},
		{key: conf.KeyDremioPatToken, want: ""},
		{key: conf.KeyDremioConfDir, want: "/opt/dremio/conf"},
		{key: conf.KeyDremioRocksdbDir, want: "/opt/dremio/data/db"},
		{key: conf.KeyCollectDremioConfiguration, want: true},
		{key: conf.KeyCaptureHeapDump, want: false},
		{key: conf.KeyNumberJobProfiles, want: 20},
		{key: conf.KeyDremioEndpoint, want: "http://localhost:9047"},
		{key: conf.KeyTarballOutDir, want: "/tmp/ddc"},
		{key: conf.KeyCollectOSConfig, want: true},
		{key: conf.KeyCollectDiskUsage, want: true},
		{key: conf.KeyDremioLogsNumDays, want: 2},
		{key: conf.KeyDremioQueriesJSONNumDays, want: 2},
		{key: conf.KeyDremioGCFilePattern, want: "gc*.log*"},
		{key: conf.KeyCollectQueriesJSON, want: true},
		{key: conf.KeyCollectServerLogs, want: true},
		{key: conf.KeyCollectMetaRefreshLog, want: true},
		{key: conf.KeyCollectReflectionLog, want: true},
		{key: conf.KeyCollectGCLogs, want: true},
		{key: conf.KeyCollectJFR, want: false},
		{key: conf.KeyCollectJStack, want: false},
		{key: conf.KeyCollectSystemTablesExport, want: true},
		{key: conf.KeyCollectWLM, want: true},
		{key: conf.KeyCollectTtop, want: false},
		{key: conf.KeyCollectKVStoreReport, want: true},
		{key: conf.KeyDremioJStackTimeSeconds, want: defaultCaptureSeconds},
		{key: conf.KeyDremioJFRTimeSeconds, want: defaultCaptureSeconds},
		{key: conf.KeyDremioJStackFreqSeconds, want: 1},
		{key: conf.KeyDremioTtopFreqSeconds, want: 1},
		{key: conf.KeyDremioTtopTimeSeconds, want: defaultCaptureSeconds},
		{key: conf.KeyDremioGCLogsDir, want: ""},
		{key: conf.KeyNodeName, want: hostName},
		{key: conf.KeyAcceptCollectionConsent, want: true},
		{key: conf.KeyAllowInsecureSSL, want: true},
	}

	for i := range cases {
		tc := cases[i]
		got := confData[tc.key]
		if got == tc.want {
			continue
		}
		// Report every mismatch rather than stopping at the first one.
		t.Errorf("Unexpected value for '%s'. Got %v, expected %v", tc.key, got, tc.want)
	}
}

// TestSetViperDefaults checks every default that SetViperDefaults writes into
// the configuration map when the collection mode is collects.StandardCollection.
// Each table entry pairs a configuration key with the value the map is expected
// to hold for it after the defaults are applied.
func TestSetViperDefaults(t *testing.T) {
	// defaultCase is one key/value expectation against the populated map.
	type defaultCase struct {
		key  string
		want interface{}
	}

	confData, hostName, defaultCaptureSeconds := setupTestSetViperDefaults(collects.StandardCollection)

	cases := []defaultCase{
		{key: conf.KeyDisableRESTAPI, want: false},
		{key: conf.KeyCollectAccelerationLog, want: false},
		{key: conf.KeyCollectAccessLog, want: false},
		{key: conf.KeyCollectAuditLog, want: false},
		{key: conf.KeyCollectJVMFlags, want: true},
		{key: conf.KeyDremioLogDir, want: "/var/log/dremio"},
		{key: conf.KeyNumberThreads, want: 2},
		{key: conf.KeyDremioPid, want: 0},
		{key: conf.KeyDremioPidDetection, want: true},
		{key: conf.KeyDremioUsername, want: "dremio"},
		{key: conf.KeyDremioPatToken, want: ""},
		{key: conf.KeyDremioConfDir, want: "/opt/dremio/conf"},
		{key: conf.KeyDremioRocksdbDir, want: "/opt/dremio/data/db"},
		{key: conf.KeyCollectDremioConfiguration, want: true},
		{key: conf.KeyCaptureHeapDump, want: false},
		{key: conf.KeyNumberJobProfiles, want: 20},
		{key: conf.KeyDremioEndpoint, want: "http://localhost:9047"},
		{key: conf.KeyTarballOutDir, want: "/tmp/ddc"},
		{key: conf.KeyCollectOSConfig, want: true},
		{key: conf.KeyCollectDiskUsage, want: true},
		{key: conf.KeyDremioLogsNumDays, want: 7},
		{key: conf.KeyDremioQueriesJSONNumDays, want: 30},
		{key: conf.KeyDremioGCFilePattern, want: "gc*.log*"},
		{key: conf.KeyCollectQueriesJSON, want: true},
		{key: conf.KeyCollectServerLogs, want: true},
		{key: conf.KeyCollectMetaRefreshLog, want: true},
		{key: conf.KeyCollectReflectionLog, want: true},
		{key: conf.KeyCollectGCLogs, want: true},
		{key: conf.KeyCollectJFR, want: true},
		{key: conf.KeyCollectJStack, want: false},
		{key: conf.KeyCollectSystemTablesExport, want: true},
		{key: conf.KeyCollectWLM, want: true},
		{key: conf.KeyCollectTtop, want: true},
		{key: conf.KeyCollectKVStoreReport, want: true},
		{key: conf.KeyDremioJStackTimeSeconds, want: defaultCaptureSeconds},
		{key: conf.KeyDremioJFRTimeSeconds, want: defaultCaptureSeconds},
		{key: conf.KeyDremioJStackFreqSeconds, want: 1},
		{key: conf.KeyDremioTtopFreqSeconds, want: 1},
		{key: conf.KeyDremioTtopTimeSeconds, want: defaultCaptureSeconds},
		{key: conf.KeyDremioGCLogsDir, want: ""},
		{key: conf.KeyNodeName, want: hostName},
		{key: conf.KeyAcceptCollectionConsent, want: true},
		{key: conf.KeyAllowInsecureSSL, want: true},
	}

	for i := range cases {
		tc := cases[i]
		got := confData[tc.key]
		if got == tc.want {
			continue
		}
		// Report every mismatch rather than stopping at the first one.
		t.Errorf("Unexpected value for '%s'. Got %v, expected %v", tc.key, got, tc.want)
	}
}
47 changes: 25 additions & 22 deletions cmd/local/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,42 @@ func collect(c *conf.CollectConf) error {
if err := createAllDirs(c); err != nil {
return fmt.Errorf("unable to create directories due to error %w", err)
}

// we can probably remove this now that we have gone to single threaded, but keeping it for the delayed execution and logging for now
t, err := threading.NewThreadPool(1, 1, true, false)
if err != nil {
return fmt.Errorf("unable to spawn thread pool: %w", err)
}

wrapConfigJob := func(name string, j func(c *conf.CollectConf) error) threading.Job {
return threading.Job{
Name: name,
Process: func() error { return j(c) },
}
}

// REST call, so we move it to the front in case the token expires
if !c.CollectWLM() {
simplelog.Debug("Skipping Workload Manager report collection")
} else {
t.AddJob(wrapConfigJob("WLM COLLECTION", apicollect.RunCollectWLM))
}

// REST call, so we move it to the front in case the token expires
if !c.CollectSystemTablesExport() {
simplelog.Debug("Skipping system tables collection")
} else {
t.AddJob(wrapConfigJob("SYSTEM TABLE COLLECTION", apicollect.RunCollectDremioSystemTables))
}

if !c.IsDremioCloud() {
// REST call, so we move it to the front in case the token expires
if !c.CollectKVStoreReport() {
simplelog.Debug("Skipping KV store report collection")
} else {
t.AddJob(wrapConfigJob("KV STORE COLLECTION", apicollect.RunCollectKvReport))
}

if !c.CollectDiskUsage() {
simplelog.Info("Skipping disk usage collection")
} else {
Expand All @@ -141,7 +165,6 @@ func collect(c *conf.CollectConf) error {
}

// log collection

logCollector := logcollect.NewLogCollector(
c.DremioLogDir(),
c.LogsOutDir(),
Expand Down Expand Up @@ -233,13 +256,6 @@ func collect(c *conf.CollectConf) error {
} else {
t.AddJob(wrapConfigJob("JVM FLAG COLLECTION", jvmcollect.RunCollectJVMFlags))
}
// rest call collections

if !c.CollectKVStoreReport() {
simplelog.Debug("Skipping KV store report collection")
} else {
t.AddJob(wrapConfigJob("KV STORE COLLECTION", apicollect.RunCollectKvReport))
}

if !c.CollectTtop() {
simplelog.Debugf("Skipping ttop collection")
Expand All @@ -265,31 +281,18 @@ func collect(c *conf.CollectConf) error {
}
}

if !c.CollectWLM() {
simplelog.Debug("Skipping Workload Manager report collection")
} else {
t.AddJob(wrapConfigJob("WLM COLLECTION", apicollect.RunCollectWLM))
}

if !c.CollectSystemTablesExport() {
simplelog.Debug("Skipping system tables collection")
} else {
t.AddJob(wrapConfigJob("SYSTEM TABLE COLLECTION", apicollect.RunCollectDremioSystemTables))
}

if err := t.ProcessAndWait(); err != nil {
simplelog.Errorf("thread pool has an error: %v", err)
}

//we wait on the thread pool to empty out as this is also multithreaded and takes the longest
// this has to happen after the queries.json collection so we don't have much choice and have to leave it here
if c.NumberJobProfilesToCollect() == 0 {
simplelog.Debugf("Skipping job profiles collection")
} else {
if err := apicollect.RunCollectJobProfiles(c); err != nil {
simplelog.Errorf("during job profile collection there was an error: %v", err)
}
}

if err := runCollectClusterStats(c); err != nil {
simplelog.Errorf("during unable to collect cluster stats like cluster ID: %v", err)
}
Expand Down
2 changes: 1 addition & 1 deletion default-ddc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# collect-access-log: false
# collect-audit-log: false
# collect-dremio-configuration: true # will collect dremio.conf, dremio-env, logback.xml and logback-access.xml
# number-job-profiles: 25000 # up to this number, may have less due to duplicates NOTE: need to have the dremio-pat-token set to work
# number-job-profiles: 20 # defaults to 25000 when a health check is selected; collects up to this number, may have fewer due to duplicates. NOTE: dremio-pat-token must be set for this to work
# capture-heap-dump: false # when true a heap dump will be captured on each node that the collector is run against
# accept-collection-consent: true # when true you accept consent to collect data on each node, if false collection will fail
# allow-insecure-ssl: true # when true skip the ssl cert check when doing API calls
Expand Down

0 comments on commit f2e6d33

Please sign in to comment.