Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skip cgroup monitoring if log collector doesn't start by the agent. #2939

Merged
merged 3 commits into from
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions azurelinuxagent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import sys
import threading
from azurelinuxagent.ga import logcollector, cgroupconfigurator
from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup
from azurelinuxagent.ga.cgroupapi import SystemdCgroupsApi

import azurelinuxagent.common.conf as conf
Expand Down Expand Up @@ -204,11 +205,10 @@ def collect_logs(self, is_full_mode):
logger.info("Running log collector mode normal")

# Check the cgroups unit
cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None
if CollectLogsHandler.should_validate_cgroups():
narrieta marked this conversation as resolved.
Show resolved Hide resolved
cgroups_api = SystemdCgroupsApi()
cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")

log_collector_monitor = None
cgroups_api = SystemdCgroupsApi()
cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")
if CollectLogsHandler.is_enabled_monitor_cgroups_check():
cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)

Expand All @@ -221,10 +221,24 @@ def collect_logs(self, is_full_mode):

sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)

def initialize_cgroups_tracking(cpu_cgroup_path, memory_cgroup_path):
cpu_cgroup = CpuCgroup(AGENT_LOG_COLLECTOR, cpu_cgroup_path)
msg = "Started tracking cpu cgroup {0}".format(cpu_cgroup)
logger.info(msg)
cpu_cgroup.initialize_cpu_usage()
memory_cgroup = MemoryCgroup(AGENT_LOG_COLLECTOR, memory_cgroup_path)
msg = "Started tracking memory cgroup {0}".format(memory_cgroup)
logger.info(msg)
return [cpu_cgroup, memory_cgroup]

try:
log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path)
log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups)
log_collector_monitor.run()
log_collector = LogCollector(is_full_mode)
# Running log collector resource(CPU, Memory) monitoring only if agent starts the log collector.
# If Log collector start by any other means, then it will not be monitored.
if CollectLogsHandler.is_enabled_monitor_cgroups_check():
tracked_cgroups = initialize_cgroups_tracking(cpu_cgroup_path, memory_cgroup_path)
log_collector_monitor = get_log_collector_monitor_handler(tracked_cgroups)
log_collector_monitor.run()
archive = log_collector.collect_logs_and_get_archive()
logger.info("Log collection successfully completed. Archive can be found at {0} "
"and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH))
Expand Down
10 changes: 5 additions & 5 deletions azurelinuxagent/ga/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,16 @@ def get_thread_name():
return CollectLogsHandler._THREAD_NAME

@staticmethod
def enable_cgroups_validation():
def enable_monitor_cgroups_check():
os.environ[CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE] = "1"

@staticmethod
def disable_cgroups_validation():
def disable_monitor_cgroups_check():
if CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE in os.environ:
del os.environ[CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE]

@staticmethod
def should_validate_cgroups():
def is_enabled_monitor_cgroups_check():
if CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE in os.environ:
return os.environ[CollectLogsHandler.__CGROUPS_FLAG_ENV_VARIABLE] == "1"
return False
Expand Down Expand Up @@ -147,7 +147,7 @@ def daemon(self):
time.sleep(_INITIAL_LOG_COLLECTION_DELAY)

try:
CollectLogsHandler.enable_cgroups_validation()
CollectLogsHandler.enable_monitor_cgroups_check()
if self.protocol_util is None or self.protocol is None:
self.init_protocols()

Expand All @@ -162,7 +162,7 @@ def daemon(self):
except Exception as e:
logger.error("An error occurred in the log collection thread; will exit the thread.\n{0}", ustr(e))
finally:
CollectLogsHandler.disable_cgroups_validation()
CollectLogsHandler.disable_monitor_cgroups_check()

def collect_and_send_logs(self):
if self._collect_logs():
Expand Down
15 changes: 1 addition & 14 deletions azurelinuxagent/ga/logcollector.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from datetime import datetime
from heapq import heappush, heappop

from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_LOG_COLLECTOR, MemoryCgroup
from azurelinuxagent.common.conf import get_lib_dir, get_ext_log_dir, get_agent_log_file
from azurelinuxagent.common.event import initialize_event_logger_vminfo_common_parameters
from azurelinuxagent.common.future import ustr
Expand Down Expand Up @@ -71,14 +70,13 @@ class LogCollector(object):

_TRUNCATED_FILE_PREFIX = "truncated_"

def __init__(self, is_full_mode=False, cpu_cgroup_path=None, memory_cgroup_path=None):
def __init__(self, is_full_mode=False):
self._is_full_mode = is_full_mode
self._manifest = MANIFEST_FULL if is_full_mode else MANIFEST_NORMAL
self._must_collect_files = self._expand_must_collect_files()
self._create_base_dirs()
self._set_logger()
self._initialize_telemetry()
self.cgroups = self._set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path)

@staticmethod
def _mkdir(dirname):
Expand All @@ -105,17 +103,6 @@ def _set_logger():
_LOGGER.addHandler(_f_handler)
_LOGGER.setLevel(logging.INFO)

@staticmethod
def _set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path):
cpu_cgroup = CpuCgroup(AGENT_LOG_COLLECTOR, cpu_cgroup_path)
msg = "Started tracking cpu cgroup {0}".format(cpu_cgroup)
_LOGGER.info(msg)
cpu_cgroup.initialize_cpu_usage()
memory_cgroup = MemoryCgroup(AGENT_LOG_COLLECTOR, memory_cgroup_path)
msg = "Started tracking memory cgroup {0}".format(memory_cgroup)
_LOGGER.info(msg)
return [cpu_cgroup, memory_cgroup]

@staticmethod
def _initialize_telemetry():
protocol = get_protocol_util().get_protocol(init_goal_state=False)
Expand Down
16 changes: 8 additions & 8 deletions tests/ga/test_logcollector.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def test_log_collector_parses_commands_in_manifest(self):

with patch("azurelinuxagent.ga.logcollector.MANIFEST_NORMAL", manifest):
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
archive = log_collector.collect_logs_and_get_archive()

with open(self.output_results_file_path, "r") as fh:
Expand Down Expand Up @@ -241,7 +241,7 @@ def test_log_collector_uses_full_manifest_when_full_mode_enabled(self):

with patch("azurelinuxagent.ga.logcollector.MANIFEST_FULL", manifest):
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(is_full_mode=True, cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector(is_full_mode=True)
archive = log_collector.collect_logs_and_get_archive()

self._assert_archive_created(archive)
Expand All @@ -255,7 +255,7 @@ def test_log_collector_should_collect_all_files(self):
# and combined they do not cross the archive size threshold.

with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
archive = log_collector.collect_logs_and_get_archive()

self._assert_archive_created(archive)
Expand All @@ -277,7 +277,7 @@ def test_log_collector_should_truncate_large_text_files_and_ignore_large_binary_
# Set the size limit so that some files are too large to collect in full.
with patch("azurelinuxagent.ga.logcollector._FILE_SIZE_LIMIT", SMALL_FILE_SIZE):
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
archive = log_collector.collect_logs_and_get_archive()

self._assert_archive_created(archive)
Expand Down Expand Up @@ -311,7 +311,7 @@ def test_log_collector_should_prioritize_important_files_if_archive_too_big(self
with patch("azurelinuxagent.ga.logcollector._UNCOMPRESSED_ARCHIVE_SIZE_LIMIT", 10 * 1024 * 1024):
with patch("azurelinuxagent.ga.logcollector._MUST_COLLECT_FILES", must_collect_files):
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
archive = log_collector.collect_logs_and_get_archive()

self._assert_archive_created(archive)
Expand Down Expand Up @@ -362,7 +362,7 @@ def test_log_collector_should_update_archive_when_files_are_new_or_modified_or_d
# Ensure the archive reflects the state of files on the disk at collection time. If a file was updated, it
# needs to be updated in the archive, deleted if removed from disk, and added if not previously seen.
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
first_archive = log_collector.collect_logs_and_get_archive()
self._assert_archive_created(first_archive)

Expand Down Expand Up @@ -433,7 +433,7 @@ def test_log_collector_should_clean_up_uncollected_truncated_files(self):
with patch("azurelinuxagent.ga.logcollector._MUST_COLLECT_FILES", must_collect_files):
with patch("azurelinuxagent.ga.logcollector._FILE_SIZE_LIMIT", SMALL_FILE_SIZE):
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
archive = log_collector.collect_logs_and_get_archive()

self._assert_archive_created(archive)
Expand All @@ -455,7 +455,7 @@ def test_log_collector_should_clean_up_uncollected_truncated_files(self):
with patch("azurelinuxagent.ga.logcollector._MUST_COLLECT_FILES", must_collect_files):
with patch("azurelinuxagent.ga.logcollector._FILE_SIZE_LIMIT", SMALL_FILE_SIZE):
with patch('azurelinuxagent.ga.logcollector.LogCollector._initialize_telemetry'):
log_collector = LogCollector(cpu_cgroup_path="dummy_cpu_path", memory_cgroup_path="dummy_memory_path")
log_collector = LogCollector()
second_archive = log_collector.collect_logs_and_get_archive()

expected_files = [
Expand Down
8 changes: 4 additions & 4 deletions tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def test_calls_collect_logs_with_proper_mode(self, mock_log_collector, *args):
@patch("azurelinuxagent.agent.LogCollector")
def test_calls_collect_logs_on_valid_cgroups(self, mock_log_collector):
try:
CollectLogsHandler.enable_cgroups_validation()
CollectLogsHandler.enable_monitor_cgroups_check()
mock_log_collector.run = Mock()

def mock_cgroup_paths(*args, **kwargs):
Expand All @@ -248,12 +248,12 @@ def mock_cgroup_paths(*args, **kwargs):

mock_log_collector.assert_called_once()
finally:
CollectLogsHandler.disable_cgroups_validation()
CollectLogsHandler.disable_monitor_cgroups_check()

@patch("azurelinuxagent.agent.LogCollector")
def test_doesnt_call_collect_logs_on_invalid_cgroups(self, mock_log_collector):
try:
CollectLogsHandler.enable_cgroups_validation()
CollectLogsHandler.enable_monitor_cgroups_check()
mock_log_collector.run = Mock()

def mock_cgroup_paths(*args, **kwargs):
Expand All @@ -272,7 +272,7 @@ def mock_cgroup_paths(*args, **kwargs):
mock_exit.assert_called_once_with(logcollector.INVALID_CGROUPS_ERRCODE)
self.assertEqual(exit_error, re)
finally:
CollectLogsHandler.disable_cgroups_validation()
CollectLogsHandler.disable_monitor_cgroups_check()

def test_it_should_parse_setup_firewall_properly(self):

Expand Down
Loading