-
Notifications
You must be signed in to change notification settings - Fork 905
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(webserver): implement metrics endpoint #3140
Changes from 10 commits
aa4ae6f
d527083
1bb9b4e
e0f9c92
d6e980e
0cf84c8
beda9dd
9d3e602
d575380
c940ceb
71e1783
459f4bd
aa0e813
34436d6
6746e3b
2c2db8d
2a60be3
494ecc2
934df4c
b587754
815af40
77a6a71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
/* | ||
Copyright (C) 2024 The Falco Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
#include "falco_metrics.h" | ||
|
||
#include "app/state.h" | ||
|
||
#include <libsinsp/sinsp.h> | ||
|
||
falco_metrics::falco_metrics(falco::app::state& state) | ||
{ | ||
falco_configuration::webserver_config webserver_config = state.config->m_webserver_config; | ||
m_metrics_enabled = state.config->m_metrics_enabled && webserver_config.m_metrics_enabled; | ||
|
||
if (m_metrics_enabled) | ||
{ | ||
for (const auto& source_info: state.source_infos) | ||
{ | ||
sinsp *source_inspector = source_info.inspector.get(); | ||
m_inspectors.push_back(source_inspector); | ||
m_metrics_collectors.push_back(libs::metrics::libs_metrics_collector(source_inspector, state.config->m_metrics_flags)); | ||
} | ||
} | ||
} | ||
|
||
std::string falco_metrics::to_text() const | ||
{ | ||
if (!m_metrics_enabled) | ||
{ | ||
return ""; | ||
} | ||
|
||
static const char* all_driver_engines[] = { | ||
BPF_ENGINE, KMOD_ENGINE, MODERN_BPF_ENGINE, | ||
SOURCE_PLUGIN_ENGINE, NODRIVER_ENGINE, GVISOR_ENGINE }; | ||
|
||
|
||
libs::metrics::prometheus_metrics_converter prometheus_metrics_converter; | ||
std::string prometheus_text; | ||
|
||
for (auto* inspector: m_inspectors) | ||
{ | ||
for (size_t i = 0; i < sizeof(all_driver_engines) / sizeof(const char*); i++) | ||
{ | ||
if (inspector->check_current_engine(all_driver_engines[i])) | ||
{ | ||
prometheus_text += prometheus_metrics_converter.convert_metric_to_text_prometheus("engine_name", "falcosecurity", "scap", {{"engine_name", all_driver_engines[i]}}); | ||
break; | ||
} | ||
} | ||
|
||
const scap_agent_info* agent_info = inspector->get_agent_info(); | ||
const scap_machine_info* machine_info = inspector->get_machine_info(); | ||
|
||
libs::metrics::libs_metrics_collector libs_metrics_collector(inspector, 0); | ||
|
||
prometheus_text += prometheus_metrics_converter.convert_metric_to_text_prometheus("falco_version", "falcosecurity", "falco", {{"falco_version", FALCO_VERSION}}); | ||
prometheus_text += prometheus_metrics_converter.convert_metric_to_text_prometheus("kernel_release", "falcosecurity", "falco", {{"kernel_release", agent_info->uname_r}}); | ||
prometheus_text += prometheus_metrics_converter.convert_metric_to_text_prometheus("hostname", "falcosecurity", "evt", {{"hostname", machine_info->hostname}}); | ||
|
||
std::vector<metrics_v2> static_metrics; | ||
static_metrics.push_back(libs_metrics_collector.new_metric("start_ts", | ||
sgaist marked this conversation as resolved.
Show resolved
Hide resolved
|
||
METRICS_V2_MISC, | ||
METRIC_VALUE_TYPE_U64, | ||
METRIC_VALUE_UNIT_TIME_TIMESTAMP_NS, | ||
METRIC_VALUE_METRIC_TYPE_NON_MONOTONIC_CURRENT, | ||
agent_info->start_ts_epoch)); | ||
static_metrics.push_back(libs_metrics_collector.new_metric("host_boot_ts", | ||
METRICS_V2_MISC, | ||
METRIC_VALUE_TYPE_U64, | ||
METRIC_VALUE_UNIT_TIME_TIMESTAMP_NS, | ||
METRIC_VALUE_METRIC_TYPE_NON_MONOTONIC_CURRENT, | ||
machine_info->boot_ts_epoch)); | ||
static_metrics.push_back(libs_metrics_collector.new_metric("host_num_cpus", | ||
METRICS_V2_MISC, | ||
METRIC_VALUE_TYPE_U32, | ||
METRIC_VALUE_UNIT_COUNT, | ||
METRIC_VALUE_METRIC_TYPE_NON_MONOTONIC_CURRENT, | ||
machine_info->num_cpus)); | ||
|
||
for (auto metrics: static_metrics) | ||
incertum marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
prometheus_metrics_converter.convert_metric_to_unit_convention(metrics); | ||
prometheus_text += prometheus_metrics_converter.convert_metric_to_text_prometheus(metrics, "falcosecurity", "falco"); | ||
} | ||
} | ||
|
||
for (auto metrics_collector: m_metrics_collectors) | ||
{ | ||
metrics_collector.snapshot(); | ||
auto metrics_snapshot = metrics_collector.get_metrics(); | ||
|
||
for (auto& metrics: metrics_snapshot) | ||
{ | ||
prometheus_metrics_converter.convert_metric_to_unit_convention(metrics); | ||
std::string namespace_name = "scap"; | ||
if (metrics.flags & METRICS_V2_RESOURCE_UTILIZATION || metrics.flags & METRICS_V2_KERNEL_COUNTERS) | ||
{ | ||
namespace_name = "falco"; | ||
} | ||
prometheus_text += prometheus_metrics_converter.convert_metric_to_text_prometheus(metrics, "falcosecurity", namespace_name); | ||
} | ||
|
||
} | ||
return prometheus_text; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
/* | ||
Copyright (C) 2024 The Falco Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include "configuration.h" | ||
|
||
#include <libsinsp/sinsp.h> | ||
|
||
namespace falco::app { | ||
struct state; | ||
} | ||
|
||
class falco_metrics | ||
{ | ||
public: | ||
falco_metrics(falco::app::state& state); | ||
bool is_enabled() const { return m_metrics_enabled; }; | ||
std::string to_text() const; | ||
|
||
private: | ||
bool m_metrics_enabled = false; | ||
std::vector<sinsp*> m_inspectors; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sgaist would you mind getting me up to speed why we need a vector with inspectors? Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is related to the initialization state of the sinsp objects when starting the webserver. In my original implementation, all static stuff was created in the constructor of the It's a temporary design to have something working that we can discuss about. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @FedeDP @Andreagit97 plz advice, do we really need the vectors here because of the various source_infos. https://github.com/falcosecurity/falco/pull/3140/files#diff-0ebdd484eb8f6b10eb0df365c052ec7a91640bd6502ac3a7af1da82d9e62f3b1R31. We don't distinguish between the source info for the stats_writer either ... conversely maybe we should. Right now we primarily advertise metrics for syscalls source -- it's not properly working for plugin only either atm given the scap refactor regression. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd cc @jasondellaluce on this point, he might be able to help us :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @FedeDP @jasondellaluce @leogr +1 would appreciate final guidance on what you prefer so that we can get this over the finish line. Thanks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have strong opinions, but I agree with this 👇
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Re. the raw pointers, generally non-owning raw pointers are fine but the problems we can have is when these objects are free'd or reallocated by their owners. I don't completely remember the complete mechanism that happens in the restart case, but will this work if Falco is restarted, e.g. when a rule is replaced and Falco auto-reloads (therefore rebuilding inspectors)? iirc it should but could you please double check? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I see your point about creating non-mutating info at construction time, however I think that these non-owning raw pointers can create more harm than benefit. The issue is that they rely on leaked knowledge about:
So IMO it's not just the raw pointer (that could be solved by making it owner through a shared_ptr instance), but more that we leak implicit knowledge about other parts of the codebase. IMO, moving everything in I'm not gonna fight for this of course, but this is my point overall. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In addition to your excellent points, I did not take into account the dynamic reloading use case which also goes in the favor of your suggested implementation so I will do that. I think the lazy caching is a good idea for a follow up PR. |
||
std::vector<libs::metrics::libs_metrics_collector> m_metrics_collectors; | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we sure we want to overload the
listen_port
or instead add anothermetrics_port
config?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought that the same webserver would serve both endpoints.
This brings the following question: should we have different web server instance for the metrics part ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair, you actually asked me at the beginning if we should just use the existing webserver and I said yes as for me I was referring to existing code ... What would you suggest? Or what is better practice if you want to expose multiple ports? @FedeDP what's you take on this?
I personally never set up webservers, instead if anything I tried to break into them in the past 🙃 .
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should reuse the same webserver we already use; having everything on the same ports on different endpoints is ok from my PoV, indeed it is better to remember "right, on port X there is Falco webserver".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok then it seems settled, we keep it as is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree.
@sgaist This works for me too 👍
@incertum We noticed that an array of strings does not play well with the
-o
command line flag. So, I'd recommend using multiple_enabled
options so that it's easy to do-o foo.bar_enabled=true
.Again, I strongly believe we have to re-audit all config options and make them consistent, but it's out of scope for this PR, so I'd leave this decision for later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we might want to call the config
prometheus_enabled
?Anyway,
metrics_enabled
works fine for me too. I'd avoid the complexity to deal with possible multiple metrics backend atm.We can always introduce the new feat as
Incubating
to be able to play with it and be free to drop it/rename it in the future: https://github.com/falcosecurity/falco/blob/master/proposals/20231220-features-adoption-and-deprecation.md#maturation-levelsThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right @leogr re arrays configs. @FedeDP would then just prometheus_metrics_enabled be ok? I believe it could be beneficial for this to show up when string searching for metrics. In addition, its even clearer for newbies to metrics.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1! Looks fine to me!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took a look at this. To make the config file easier to use I think it's important to highlight that you need both
metrics_enabled
andmetrics.enabled
to true in order for this to have any effect. So I would suggestprometheus_metrics_enabled
with a comment that mentions that this only does something ifmetrics.enabled
is true.Technically I would also be ok with setting
prometheus_metrics_enabled
totrue
since metrics are disabled by default but maybe let's do it after the support is stable.