Skip to content

Commit

Permalink
Merge pull request #8559 from shirady/nsfs-nc-disable-Prometheus-serv…
Browse files Browse the repository at this point in the history
…er-start

NC | NSFS | Wrap with `try-catch` Prometheus Reporting `start_server`
  • Loading branch information
shirady authored Dec 4, 2024
2 parents e9808eb + 09c4c45 commit c03e592
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 15 deletions.
12 changes: 11 additions & 1 deletion src/endpoint/endpoint.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,17 @@ async function main(options = {}) {
// the primary just forks and returns, workers will continue to serve
fork_count = options.forks ?? config.ENDPOINT_FORKS;
const metrics_port = options.metrics_port || config.EP_METRICS_SERVER_PORT;
if (fork_utils.start_workers(metrics_port, fork_count)) return;
/**
* Please notice that we can run the main in 2 states:
* 1. Only the primary process runs the main (fork is 0 or undefined) - everything that
* is implemented here would be run by this process.
* 2. A primary process with multiple forks (IMPORTANT) - if there is implementation that
* in only relevant to the primary process it should be implemented in
* fork_utils.start_workers because the primary process returns after start_workers
* and the forks will continue executing the code lines in this function
* */
const is_workers_started_from_primary = await fork_utils.start_workers(metrics_port, fork_count);
if (is_workers_started_from_primary) return;

const endpoint_group_id = process.env.ENDPOINT_GROUP_ID || 'default-endpoint-group';

Expand Down
12 changes: 10 additions & 2 deletions src/manage_nsfs/diagnose.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,17 @@ async function gather_metrics() {
const buffer = await buffer_utils.read_stream_join(res);
const body = buffer.toString('utf8');
metrics_output = JSON.parse(body);
if (!metrics_output) throw new Error('received empty metrics response', { cause: res.statusCode });
write_stdout_response(ManageCLIResponse.MetricsStatus, metrics_output);
} else if (res.statusCode >= 500 && res.rawHeaders.includes('application/json')) {
const buffer = await buffer_utils.read_stream_join(res);
const body = buffer.toString('utf8');
const error_output = JSON.parse(body);
if (!error_output) throw new Error('received empty metrics response', { cause: res.statusCode });
throw_cli_error({ ...ManageCLIError.MetricsStatusFailed, ...error_output });
} else {
throw new Error('received empty metrics response', { cause: res.statusCode });
}
if (!metrics_output) throw new Error('recieved empty metrics response', { cause: res.statusCode });
write_stdout_response(ManageCLIResponse.MetricsStatus, metrics_output);
} catch (err) {
dbg.warn('could not receive metrics response', err);
throw_cli_error({ ...ManageCLIError.MetricsStatusFailed, cause: err?.errors?.[0] || err });
Expand Down
27 changes: 20 additions & 7 deletions src/server/analytic_services/prometheus_reporting.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,7 @@ async function start_server(
const server = http.createServer(async (req, res) => {
// Serve all metrics on the root path for system that do have one or more fork running.
if (fork_enabled) {
const metrics = await aggregatorRegistry.clusterMetrics();
if (req.url === '' || req.url === '/') {
res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
res.end(metrics);
return;
}
// we would like this part to be first as clusterMetrics might fail.
if (req.url === '/metrics/nsfs_stats') {
res.writeHead(200, { 'Content-Type': 'text/plain' });
const nsfs_report = {
Expand All @@ -77,6 +72,24 @@ async function start_server(
res.end(JSON.stringify(nsfs_report));
return;
}
let metrics;
try {
metrics = await aggregatorRegistry.clusterMetrics();
} catch (err) {
dbg.error('start_server: Could not get the metrics, got an error', err);
res.writeHead(504, { 'Content-Type': 'application/json' });
const reply = JSON.stringify({
error: 'Internal server error - timeout',
message: 'Looks like the server is taking a long time to respond (Could not get the metrics)',
});
res.end(reply);
return;
}
if (req.url === '' || req.url === '/') {
res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
res.end(metrics);
return;
}
// Serve report's metrics on the report name path
const report_name = req.url.substr(1);
const single_metrics = export_single_metrics(metrics, report_name);
Expand Down Expand Up @@ -165,7 +178,7 @@ async function metrics_nsfs_stats_handler() {
op_stats_counters: op_stats_counters,
fs_worker_stats_counters: fs_worker_stats_counters
};
dbg.log1(`_create_nsfs_report: nsfs_report ${nsfs_report}`);
dbg.log1('_create_nsfs_report: nsfs_report', nsfs_report);
return JSON.stringify(nsfs_report);
}

Expand Down
10 changes: 5 additions & 5 deletions src/util/fork_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ const fs_workers_stats = {};
*
* @param {number?} count number of workers to start.
* @param {number?} metrics_port prometheus metris port.
* @returns {boolean} true if workers were started.
* @returns {Promise<boolean>} true if workers were started.
*/
function start_workers(metrics_port, count = 0) {
async function start_workers(metrics_port, count = 0) {
const exit_events = [];
if (cluster.isPrimary && count > 0) {
for (let i = 0; i < count; ++i) {
Expand Down Expand Up @@ -68,12 +68,12 @@ function start_workers(metrics_port, count = 0) {
});
for (const id in cluster.workers) {
if (id) {
cluster.workers[id].on('message', nsfs_io_state_handler);
cluster.workers[id].on('message', nsfs_io_stats_handler);
}
}
if (metrics_port > 0) {
dbg.log0('Starting metrics server', metrics_port);
prom_reporting.start_server(metrics_port, true);
await prom_reporting.start_server(metrics_port, true);
dbg.log0('Started metrics server successfully');
}
return true;
Expand All @@ -82,7 +82,7 @@ function start_workers(metrics_port, count = 0) {
return false;
}

function nsfs_io_state_handler(msg) {
function nsfs_io_stats_handler(msg) {
if (msg.io_stats) {
for (const [key, value] of Object.entries(msg.io_stats)) {
io_stats[key] += value;
Expand Down

0 comments on commit c03e592

Please sign in to comment.