Skip to content

Commit

Permalink
CP-52524: Generate an alert when various host kernel taints are set (#…
Browse files Browse the repository at this point in the history
…6128)

Issue an alert about a broken host kernel if any of bits 4, 5, 7, 9, or 14
is set in `/proc/sys/kernel/tainted`. A set bit indicates that some kind of
error was encountered and that the future behaviour of the kernel might no
longer be predictable or safe (though it should generally recover
reasonably).

Only one alert per tainted bit per boot should be issued.

Distinguish between Major (4,5,7 - these are all things that might cause
a
host crash, but are unlikely to corrupt whatever data has been written
out) and
Warning (9, 14 - might be a concern and could be raised to Support but
usually
are not severe enough to crash the host) levels of errors as
suggested by the Foundations team.

This should serve as an indicator during issue investigation to look for
the
cause of the taint.
  • Loading branch information
last-genius authored Dec 3, 2024
2 parents e2f96bf + aaabb6c commit b782202
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 19 deletions.
6 changes: 6 additions & 0 deletions ocaml/xapi-consts/api_messages.ml
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,12 @@ let host_internal_certificate_expiring_07 =

let failed_login_attempts = addMessage "FAILED_LOGIN_ATTEMPTS" 3L

(** [kernel_is_broken which] builds the message name
    ["HOST_KERNEL_ENCOUNTERED_ERROR_" ^ which] and passes it to [addMessage]
    together with [2L] (presumably the message priority; confirm against
    [addMessage]). *)
let kernel_is_broken which =
  let name = "HOST_KERNEL_ENCOUNTERED_ERROR_" ^ which in
  addMessage name 2L

(** [kernel_is_broken_warning which] builds the message name
    ["HOST_KERNEL_ENCOUNTERED_WARNING_" ^ which] and passes it to
    [addMessage] together with [3L] (presumably the message priority; confirm
    against [addMessage]). *)
let kernel_is_broken_warning which =
  let name = "HOST_KERNEL_ENCOUNTERED_WARNING_" ^ which in
  addMessage name 3L

let tls_verification_emergency_disabled =
addMessage "TLS_VERIFICATION_EMERGENCY_DISABLED" 3L

Expand Down
27 changes: 15 additions & 12 deletions ocaml/xapi/dbsync_slave.ml
Original file line number Diff line number Diff line change
Expand Up @@ -63,21 +63,24 @@ let create_localhost ~__context info =
in
()

(* TODO cat /proc/stat for btime ? *)
(* Returns the host boot time as a date. Any exception raised while computing
   it is logged with [debug] and [Date.epoch] is returned instead.
   NOTE(review): this diff view interleaves the lines removed by the commit
   with the lines added by it (12 deletions, 15 additions), so the body below
   does not parse as a single expression. *)
let get_start_time () =
try
(* Removed lines: boot time computed as the current time minus the first
   space-separated field of /proc/uptime. *)
debug "Calculating boot time..." ;
let now = Unix.time () in
let uptime = Unixext.string_of_file "/proc/uptime" in
let uptime = String.trim uptime in
let uptime = String.split ' ' uptime in
let uptime = List.hd uptime in
let uptime = float_of_string uptime in
let boot_time = Date.of_unix_time (now -. uptime) in
debug " system booted at %s" (Date.to_rfc3339 boot_time) ;
boot_time
(* Added lines: boot time taken from the second space-separated field of the
   first /proc/stat line that starts with "btime". [List.find] raises
   [Not_found] when no such line exists; the handler below catches it. *)
match
Unixext.string_of_file "/proc/stat"
|> String.trim
|> String.split '\n'
|> List.find (fun s -> String.starts_with ~prefix:"btime" s)
|> String.split ' '
with
| _ :: btime :: _ ->
let boot_time = Date.of_unix_time (float_of_string btime) in
debug "%s: system booted at %s" __FUNCTION__ (Date.to_rfc3339 boot_time) ;
boot_time
| _ ->
failwith "Couldn't parse /proc/stat"
with e ->
(* Removed line: the old log message without the function name. *)
debug "Calculating boot time failed with '%s'" (ExnHelper.string_of_exn e) ;
(* Added lines: the same log message prefixed with [__FUNCTION__]. *)
debug "%s: Calculating boot time failed with '%s'" __FUNCTION__
(ExnHelper.string_of_exn e) ;
Date.epoch

(* not sufficient just to fill in this data on create time [Xen caps may change if VT enabled in BIOS etc.] *)
Expand Down
75 changes: 75 additions & 0 deletions ocaml/xapi/xapi_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2923,6 +2923,81 @@ let emergency_reenable_tls_verification ~__context =
Helpers.touch_file Constants.verify_certificates_path ;
Db.Host.set_tls_verification_enabled ~__context ~self ~value:true

(** Issue an alert if /proc/sys/kernel/tainted indicates particular kernel
    errors. At most one alert per taint bit is sent per boot.

    The list of candidate alerts is computed lazily on the first call: every
    alert for which a message with the same name already exists with a
    timestamp later than the host's recorded ["boot_time"] is dropped, so that
    a toolstack restart does not issue it a second time. On each call, the
    remaining candidates whose taint bit is set are sent and removed from the
    list; the others stay so they can be sent by a later call.

    @raise Not_found if the host's other_config has no ["boot_time"] key.
    @raise Failure if the boot time or the taint value cannot be parsed as a
    number. *)
let alert_if_kernel_broken =
  let __context = Context.make "host_kernel_error_alert_startup_check" in
  (* Only add an alert if
     (a) an alert wasn't already issued for the currently booted kernel *)
  let possible_alerts =
    ref
      ( lazy
        ((* Check all the alerts since last reboot. Only done once at toolstack
            startup, we track if alerts have been issued afterwards internally *)
         let self = Helpers.get_localhost ~__context in
         let boot_time =
           Db.Host.get_other_config ~__context ~self
           |> List.assoc "boot_time"
           |> float_of_string
           |> API.Date.of_unix_time
         in
         let all_alerts =
           [
             (* processor reported a Machine Check Exception (MCE) *)
             (4, Api_messages.kernel_is_broken "MCE")
           ; (* bad page referenced or some unexpected page flags *)
             (5, Api_messages.kernel_is_broken "BAD_PAGE")
           ; (* kernel died recently, i.e. there was an OOPS or BUG *)
             (7, Api_messages.kernel_is_broken "BUG")
           ; (* kernel issued warning *)
             (9, Api_messages.kernel_is_broken_warning "WARN")
           ; (* soft lockup occurred *)
             (14, Api_messages.kernel_is_broken_warning "SOFT_LOCKUP")
           ]
         in
         (* Fetch the message records once instead of once per candidate *)
         let existing_messages =
           Helpers.call_api_functions ~__context (fun rpc session_id ->
               Client.Client.Message.get_all_records ~rpc ~session_id
           )
         in
         let already_issued_for_this_boot alert_message =
           List.exists
             (fun (_, record) ->
               record.API.message_name = fst alert_message
               && API.Date.is_later ~than:boot_time
                    record.API.message_timestamp
             )
             existing_messages
         in
         (* Keep only the alerts that have NOT been issued since boot: these
            are the ones that may still need to be sent *)
         List.filter
           (fun (_, alert_message) ->
             not (already_issued_for_this_boot alert_message)
           )
           all_alerts
        )
      )
  in
  (* and (b) if we found a problem *)
  fun ~__context ->
    match Lazy.force !possible_alerts with
    | [] ->
        (* every alert has already been issued for this boot *)
        ()
    | pending ->
        let self = Helpers.get_localhost ~__context in
        (* Read the taint flags once per call. The file content ends with a
           newline, which int_of_string does not accept, hence the trim *)
        let taint_flags =
          Unixext.string_of_file "/proc/sys/kernel/tainted"
          |> String.trim
          |> int_of_string
        in
        let still_pending =
          List.filter
            (fun (alert_bit, alert_message) ->
              let is_bit_tainted = (taint_flags lsr alert_bit) land 1 = 1 in
              if is_bit_tainted then (
                let host = Db.Host.get_name_label ~__context ~self in
                let body =
                  Printf.sprintf "<body><host>%s</host></body>" host
                in
                Xapi_alert.add ~msg:alert_message ~cls:`Host
                  ~obj_uuid:(Db.Host.get_uuid ~__context ~self)
                  ~body ;
                false (* alert issued, remove from the list *)
              ) else
                true (* keep in the list, alert can be issued later *)
            )
            pending
        in
        possible_alerts := Lazy.from_val still_pending

let alert_if_tls_verification_was_emergency_disabled ~__context =
let tls_verification_enabled_locally =
Stunnel_client.get_verify_by_default ()
Expand Down
2 changes: 2 additions & 0 deletions ocaml/xapi/xapi_host.mli
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,8 @@ val set_numa_affinity_policy :

val emergency_disable_tls_verification : __context:Context.t -> unit

(** Reads /proc/sys/kernel/tainted and sends a host alert for each monitored
    taint bit (4, 5, 7, 9, 14) that is set; intended to send at most one alert
    per bit per boot. *)
val alert_if_kernel_broken : __context:Context.t -> unit

val alert_if_tls_verification_was_emergency_disabled :
__context:Context.t -> unit

Expand Down
7 changes: 7 additions & 0 deletions ocaml/xapi/xapi_periodic_scheduler_init.ml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ let register ~__context =
(Xapi_periodic_scheduler.Periodic freq) freq
Xapi_pool.alert_failed_login_attempts
) ;
Xapi_periodic_scheduler.add_to_queue "broken_kernel"
(Xapi_periodic_scheduler.Periodic 600.) 600. (fun () ->
Server_helpers.exec_with_new_task
"Periodic alert if the running kernel is broken in some serious way."
(fun __context -> Xapi_host.alert_if_kernel_broken ~__context
)
) ;
Xapi_periodic_scheduler.add_to_queue
"Period alert if TLS verification emergency disabled"
(Xapi_periodic_scheduler.Periodic 600.) 600. (fun () ->
Expand Down
12 changes: 6 additions & 6 deletions ocaml/xenopsd/xc/domain.ml
Original file line number Diff line number Diff line change
Expand Up @@ -835,12 +835,12 @@ let create_channels ~xc uuid domid =
(* Lazily computed NUMA description, built with [NUMA.make] from the
   distances reported by [numainfo] and the per-CPU node reported by
   [cputopoinfo].
   NOTE(review): this diff view interleaves the removed [Lazy.from_fun] form
   with the added [lazy] form (6 deletions, 6 additions), so both bodies
   appear below and the block does not parse as shown. *)
let numa_hierarchy =
let open Xenctrlext in
let open Topology in
(* Removed lines: suspension built with Lazy.from_fun. *)
Lazy.from_fun (fun () ->
let xcext = get_handle () in
let distances = (numainfo xcext).distances in
let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in
NUMA.make ~distances ~cpu_to_node
)
(* Added lines: the same computation written with the [lazy] keyword. *)
lazy
(let xcext = get_handle () in
let distances = (numainfo xcext).distances in
let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in
NUMA.make ~distances ~cpu_to_node
)

let numa_mutex = Mutex.create ()

Expand Down
2 changes: 1 addition & 1 deletion quality-gate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

list-hd () {
N=294
N=293
LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc)
if [ "$LIST_HD" -eq "$N" ]; then
echo "OK counted $LIST_HD List.hd usages"
Expand Down

0 comments on commit b782202

Please sign in to comment.