From 228071ae9896941bec3265f9fd0e45a38e6b9fa9 Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Mon, 18 Nov 2024 15:47:51 +0000 Subject: [PATCH 1/3] CP-52524 - dbsync_slave: stop calculating boot time ourselves Completes the 15+ years old TODO, at the expense of removing an ultimate example of a "not invented here" attitude. Signed-off-by: Andrii Sultanov --- ocaml/xapi/dbsync_slave.ml | 27 +++++++++++++++------------ quality-gate.sh | 2 +- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/ocaml/xapi/dbsync_slave.ml b/ocaml/xapi/dbsync_slave.ml index 942d308107..80793c0683 100644 --- a/ocaml/xapi/dbsync_slave.ml +++ b/ocaml/xapi/dbsync_slave.ml @@ -63,21 +63,24 @@ let create_localhost ~__context info = in () -(* TODO cat /proc/stat for btime ? *) let get_start_time () = try - debug "Calculating boot time..." ; - let now = Unix.time () in - let uptime = Unixext.string_of_file "/proc/uptime" in - let uptime = String.trim uptime in - let uptime = String.split ' ' uptime in - let uptime = List.hd uptime in - let uptime = float_of_string uptime in - let boot_time = Date.of_unix_time (now -. uptime) in - debug " system booted at %s" (Date.to_rfc3339 boot_time) ; - boot_time + match + Unixext.string_of_file "/proc/stat" + |> String.trim + |> String.split '\n' + |> List.find (fun s -> String.starts_with ~prefix:"btime" s) + |> String.split ' ' + with + | _ :: btime :: _ -> + let boot_time = Date.of_unix_time (float_of_string btime) in + debug "%s: system booted at %s" __FUNCTION__ (Date.to_rfc3339 boot_time) ; + boot_time + | _ -> + failwith "Couldn't parse /proc/stat" with e -> - debug "Calculating boot time failed with '%s'" (ExnHelper.string_of_exn e) ; + debug "%s: Calculating boot time failed with '%s'" __FUNCTION__ + (ExnHelper.string_of_exn e) ; Date.epoch (* not sufficient just to fill in this data on create time [Xen caps may change if VT enabled in BIOS etc.] *) diff --git a/quality-gate.sh b/quality-gate.sh index db8444b53e..b1d170041f 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -3,7 +3,7 @@ set -e list-hd () { - N=294 + N=293 LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc) if [ "$LIST_HD" -eq "$N" ]; then echo "OK counted $LIST_HD List.hd usages" From f0c9b4ce8c22302a213a64ef20d75ed78b7a06cf Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Tue, 19 Nov 2024 09:45:05 +0000 Subject: [PATCH 2/3] CP-52524: Generate an alert when various host kernel taints are set Issue an alert about a broken host kernel if bits 4, 5, 7, 9, or 14 are set in /proc/sys/kernel/tainted, indicating some kind of error was encountered and the future behaviour of the kernel might not be predictable or safe anymore (though it generally should reasonably recover). Only one alert per tainted bit per boot can be present (more than one can be issued, if the user dismissed the alerts and restarted the toolstack). Distinguish between Major (4,5,7 - these are all things that might cause a host crash, but are unlikely to corrupt whatever data has been written out) and Warning (9, 14 - might be a concern and could be raised to Support but usually are not severe enough to crash the host) levels of errors as suggested by the Foundations team. This should serve as an indicator during issue investigation to look for the cause of the taint. Signed-off-by: Andrii Sultanov --- ocaml/xapi-consts/api_messages.ml | 6 ++ ocaml/xapi/xapi_host.ml | 75 ++++++++++++++++++++++ ocaml/xapi/xapi_host.mli | 2 + ocaml/xapi/xapi_periodic_scheduler_init.ml | 7 ++ 4 files changed, 90 insertions(+) diff --git a/ocaml/xapi-consts/api_messages.ml b/ocaml/xapi-consts/api_messages.ml index ff436199a7..d5215415c1 100644 --- a/ocaml/xapi-consts/api_messages.ml +++ b/ocaml/xapi-consts/api_messages.ml @@ -360,6 +360,12 @@ let host_internal_certificate_expiring_07 = let failed_login_attempts = addMessage "FAILED_LOGIN_ATTEMPTS" 3L +let kernel_is_broken which = + addMessage ("HOST_KERNEL_ENCOUNTERED_ERROR_" ^ which) 2L + +let kernel_is_broken_warning which = + addMessage ("HOST_KERNEL_ENCOUNTERED_WARNING_" ^ which) 3L + let tls_verification_emergency_disabled = addMessage "TLS_VERIFICATION_EMERGENCY_DISABLED" 3L diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index 9f84923fe2..377d97d22f 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -2923,6 +2923,81 @@ let emergency_reenable_tls_verification ~__context = Helpers.touch_file Constants.verify_certificates_path ; Db.Host.set_tls_verification_enabled ~__context ~self ~value:true +(** Issue an alert if /proc/sys/kernel/tainted indicates particular kernel + errors. Will send only one alert per reboot *) +let alert_if_kernel_broken = + let __context = Context.make "host_kernel_error_alert_startup_check" in + (* Only add an alert if + (a) an alert wasn't already issued for the currently booted kernel *) + let possible_alerts = + ref + ( lazy + ((* Check all the alerts since last reboot. Only done once at toolstack + startup, we track if alerts have been issued afterwards internally *) + let self = Helpers.get_localhost ~__context in + let boot_time = + Db.Host.get_other_config ~__context ~self + |> List.assoc "boot_time" + |> float_of_string + in + let all_alerts = + [ + (* processor reported a Machine Check Exception (MCE) *) + (4, Api_messages.kernel_is_broken "MCE") + ; (* bad page referenced or some unexpected page flags *) + (5, Api_messages.kernel_is_broken "BAD_PAGE") + ; (* kernel died recently, i.e. there was an OOPS or BUG *) + (7, Api_messages.kernel_is_broken "BUG") + ; (* kernel issued warning *) + (9, Api_messages.kernel_is_broken_warning "WARN") + ; (* soft lockup occurred *) + (14, Api_messages.kernel_is_broken_warning "SOFT_LOCKUP") + ] + in + all_alerts + |> List.filter (fun (_, alert_message) -> + let alert_already_issued_for_this_boot = + Helpers.call_api_functions ~__context (fun rpc session_id -> + Client.Client.Message.get_all_records ~rpc ~session_id + |> List.exists (fun (_, record) -> + record.API.message_name = fst alert_message + && API.Date.is_later + ~than:(API.Date.of_unix_time boot_time) + record.API.message_timestamp + ) + ) + in + alert_already_issued_for_this_boot + ) + ) + ) + in + (* and (b) if we found a problem *) + fun ~__context -> + let self = Helpers.get_localhost ~__context in + possible_alerts := + Lazy.from_val + (Lazy.force !possible_alerts + |> List.filter (fun (alert_bit, alert_message) -> + let is_bit_tainted = + Unixext.string_of_file "/proc/sys/kernel/tainted" + |> int_of_string + in + let is_bit_tainted = (is_bit_tainted lsr alert_bit) land 1 = 1 in + if is_bit_tainted then ( + let host = Db.Host.get_name_label ~__context ~self in + let body = + Printf.sprintf "%s" host + in + Xapi_alert.add ~msg:alert_message ~cls:`Host + ~obj_uuid:(Db.Host.get_uuid ~__context ~self) + ~body ; + false (* alert issued, remove from the list *) + ) else + true (* keep in the list, alert can be issued later *) + ) + ) + let alert_if_tls_verification_was_emergency_disabled ~__context = let tls_verification_enabled_locally = Stunnel_client.get_verify_by_default () diff --git a/ocaml/xapi/xapi_host.mli b/ocaml/xapi/xapi_host.mli index c303ee6959..ec2e3b0fcf 100644 --- a/ocaml/xapi/xapi_host.mli +++ b/ocaml/xapi/xapi_host.mli @@ -539,6 +539,8 @@ val set_numa_affinity_policy : val emergency_disable_tls_verification : __context:Context.t -> unit +val alert_if_kernel_broken : __context:Context.t -> unit + val alert_if_tls_verification_was_emergency_disabled : __context:Context.t -> unit diff --git a/ocaml/xapi/xapi_periodic_scheduler_init.ml b/ocaml/xapi/xapi_periodic_scheduler_init.ml index 5b49ebcde5..d74b349e24 100644 --- a/ocaml/xapi/xapi_periodic_scheduler_init.ml +++ b/ocaml/xapi/xapi_periodic_scheduler_init.ml @@ -106,6 +106,13 @@ let register ~__context = (Xapi_periodic_scheduler.Periodic freq) freq Xapi_pool.alert_failed_login_attempts ) ; + Xapi_periodic_scheduler.add_to_queue "broken_kernel" + (Xapi_periodic_scheduler.Periodic 600.) 600. (fun () -> + Server_helpers.exec_with_new_task + "Periodic alert if the running kernel is broken in some serious way." + (fun __context -> Xapi_host.alert_if_kernel_broken ~__context + ) + ) ; Xapi_periodic_scheduler.add_to_queue "Period alert if TLS verification emergency disabled" (Xapi_periodic_scheduler.Periodic 600.) 600. (fun () -> From aaabb6cad3a01a5418344d95953e176ffd7d9f8c Mon Sep 17 00:00:00 2001 From: Andrii Sultanov Date: Fri, 22 Nov 2024 09:14:00 +0000 Subject: [PATCH 3/3] xenopsd: Optimize lazy evaluation The manual notes that `Lazy.from_fun` "should only be used if the function f is already defined. In particular it is always less efficient to write `from_fun (fun () -> expr)` than `lazy expr`. So, replace `Lazy.from_fun` with `lazy` in this particular case Compare the lambda dump for `lazy (fun () -> [| 1;2;3 |] |> Array.map (fun x -> x+1))`: ``` (seq (let (l/269 = (function param/319[int] (apply (field_imm 12 (global Stdlib__Array!)) (function x/318[int] : int (+ x/318 1)) (makearray[int] 1 2 3)))) (setfield_ptr(root-init) 0 (global Main!) l/269)) 0) ``` with the lambda dump of the `let x = Lazy.from_fun (fun () -> [| 1;2;3 |] |> Array.map (fun x -> x+1))`: ``` (seq (let (x/269 = (apply (field_imm 5 (global Stdlib__Lazy!)) (function param/332[int] (apply (field_imm 12 (global Stdlib__Array!)) (function x/331[int] : int (+ x/331 1)) (makearray[int] 1 2 3))))) (setfield_ptr(root-init) 0 (global Main!) x/269)) 0) ``` See: https://patricoferris.github.io/js_of_ocamlopt/#code=bGV0IGwgPSBsYXp5IChmdW4gKCkgLT4gW3wgMTsyOzMgfF0gfD4gQXJyYXkubWFwIChmdW4geCAtPiB4KzEpKQoKbGV0IHggPSBMYXp5LmZyb21fZnVuIChmdW4gKCkgLT4gW3wgMTsyOzMgfF0gfD4gQXJyYXkubWFwIChmdW4geCAtPiB4KzEpKQ%3D%3D Signed-off-by: Andrii Sultanov --- ocaml/xenopsd/xc/domain.ml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 7b31011aab..d33fc482e5 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -835,12 +835,12 @@ let create_channels ~xc uuid domid = let numa_hierarchy = let open Xenctrlext in let open Topology in - Lazy.from_fun (fun () -> - let xcext = get_handle () in - let distances = (numainfo xcext).distances in - let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in - NUMA.make ~distances ~cpu_to_node - ) + lazy + (let xcext = get_handle () in + let distances = (numainfo xcext).distances in + let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in + NUMA.make ~distances ~cpu_to_node + ) let numa_mutex = Mutex.create ()