diff --git a/ocaml/xapi-consts/api_messages.ml b/ocaml/xapi-consts/api_messages.ml index a81eb1a426..8e2ed38df7 100644 --- a/ocaml/xapi-consts/api_messages.ml +++ b/ocaml/xapi-consts/api_messages.ml @@ -360,6 +360,12 @@ let host_internal_certificate_expiring_07 = let failed_login_attempts = addMessage "FAILED_LOGIN_ATTEMPTS" 3L +let kernel_is_broken which = + addMessage ("HOST_KERNEL_ENCOUNTERED_ERROR_" ^ which) 2L + +let kernel_is_broken_warning which = + addMessage ("HOST_KERNEL_ENCOUNTERED_WARNING_" ^ which) 3L + let tls_verification_emergency_disabled = addMessage "TLS_VERIFICATION_EMERGENCY_DISABLED" 3L diff --git a/ocaml/xapi/dbsync_slave.ml b/ocaml/xapi/dbsync_slave.ml index 10678562e7..3ff89881de 100644 --- a/ocaml/xapi/dbsync_slave.ml +++ b/ocaml/xapi/dbsync_slave.ml @@ -63,21 +63,24 @@ let create_localhost ~__context info = in () -(* TODO cat /proc/stat for btime ? *) let get_start_time () = try - debug "Calculating boot time..." ; - let now = Unix.time () in - let uptime = Unixext.string_of_file "/proc/uptime" in - let uptime = String.trim uptime in - let uptime = String.split ' ' uptime in - let uptime = List.hd uptime in - let uptime = float_of_string uptime in - let boot_time = Date.of_unix_time (now -. uptime) in - debug " system booted at %s" (Date.to_rfc3339 boot_time) ; - boot_time + match + Unixext.string_of_file "/proc/stat" + |> String.trim + |> String.split '\n' + |> List.find (fun s -> String.starts_with ~prefix:"btime" s) + |> String.split ' ' + with + | _ :: btime :: _ -> + let boot_time = Date.of_unix_time (float_of_string btime) in + debug "%s: system booted at %s" __FUNCTION__ (Date.to_rfc3339 boot_time) ; + boot_time + | _ -> + failwith "Couldn't parse /proc/stat" with e -> - debug "Calculating boot time failed with '%s'" (ExnHelper.string_of_exn e) ; + debug "%s: Calculating boot time failed with '%s'" __FUNCTION__ + (ExnHelper.string_of_exn e) ; Date.epoch (* not sufficient just to fill in this data on create time [Xen caps may change if VT enabled in BIOS etc.] *) diff --git a/ocaml/xapi/xapi_host.ml b/ocaml/xapi/xapi_host.ml index 1b22a037b3..46e06a9b4d 100644 --- a/ocaml/xapi/xapi_host.ml +++ b/ocaml/xapi/xapi_host.ml @@ -2923,6 +2923,81 @@ let emergency_reenable_tls_verification ~__context = Helpers.touch_file Constants.verify_certificates_path ; Db.Host.set_tls_verification_enabled ~__context ~self ~value:true +(** Issue an alert if /proc/sys/kernel/tainted indicates particular kernel + errors. Will send only one alert per reboot *) +let alert_if_kernel_broken = + let __context = Context.make "host_kernel_error_alert_startup_check" in + (* Only add an alert if + (a) an alert wasn't already issued for the currently booted kernel *) + let possible_alerts = + ref + ( lazy + ((* Check all the alerts since last reboot. Only done once at toolstack + startup, we track if alerts have been issued afterwards internally *) + let self = Helpers.get_localhost ~__context in + let boot_time = + Db.Host.get_other_config ~__context ~self + |> List.assoc "boot_time" + |> float_of_string + in + let all_alerts = + [ + (* processor reported a Machine Check Exception (MCE) *) + (4, Api_messages.kernel_is_broken "MCE") + ; (* bad page referenced or some unexpected page flags *) + (5, Api_messages.kernel_is_broken "BAD_PAGE") + ; (* kernel died recently, i.e. there was an OOPS or BUG *) + (7, Api_messages.kernel_is_broken "BUG") + ; (* kernel issued warning *) + (9, Api_messages.kernel_is_broken_warning "WARN") + ; (* soft lockup occurred *) + (14, Api_messages.kernel_is_broken_warning "SOFT_LOCKUP") + ] + in + all_alerts + |> List.filter (fun (_, alert_message) -> + let alert_already_issued_for_this_boot = + Helpers.call_api_functions ~__context (fun rpc session_id -> + Client.Client.Message.get_all_records ~rpc ~session_id + |> List.exists (fun (_, record) -> + record.API.message_name = fst alert_message + && API.Date.is_later + ~than:(API.Date.of_unix_time boot_time) + record.API.message_timestamp + ) + ) + in + alert_already_issued_for_this_boot + ) + ) + ) + in + (* and (b) if we found a problem *) + fun ~__context -> + let self = Helpers.get_localhost ~__context in + possible_alerts := + Lazy.from_val + (Lazy.force !possible_alerts + |> List.filter (fun (alert_bit, alert_message) -> + let is_bit_tainted = + Unixext.string_of_file "/proc/sys/kernel/tainted" + |> int_of_string + in + let is_bit_tainted = (is_bit_tainted lsr alert_bit) land 1 = 1 in + if is_bit_tainted then ( + let host = Db.Host.get_name_label ~__context ~self in + let body = + Printf.sprintf "%s" host + in + Xapi_alert.add ~msg:alert_message ~cls:`Host + ~obj_uuid:(Db.Host.get_uuid ~__context ~self) + ~body ; + false (* alert issued, remove from the list *) + ) else + true (* keep in the list, alert can be issued later *) + ) + ) + let alert_if_tls_verification_was_emergency_disabled ~__context = let tls_verification_enabled_locally = Stunnel_client.get_verify_by_default () diff --git a/ocaml/xapi/xapi_host.mli b/ocaml/xapi/xapi_host.mli index bafc47f6de..f8fe73f837 100644 --- a/ocaml/xapi/xapi_host.mli +++ b/ocaml/xapi/xapi_host.mli @@ -540,6 +540,8 @@ val set_numa_affinity_policy : val emergency_disable_tls_verification : __context:Context.t -> unit +val alert_if_kernel_broken : __context:Context.t -> unit + val alert_if_tls_verification_was_emergency_disabled : __context:Context.t -> unit diff --git a/ocaml/xapi/xapi_periodic_scheduler_init.ml b/ocaml/xapi/xapi_periodic_scheduler_init.ml index 5b49ebcde5..d74b349e24 100644 --- a/ocaml/xapi/xapi_periodic_scheduler_init.ml +++ b/ocaml/xapi/xapi_periodic_scheduler_init.ml @@ -106,6 +106,13 @@ let register ~__context = (Xapi_periodic_scheduler.Periodic freq) freq Xapi_pool.alert_failed_login_attempts ) ; + Xapi_periodic_scheduler.add_to_queue "broken_kernel" + (Xapi_periodic_scheduler.Periodic 600.) 600. (fun () -> + Server_helpers.exec_with_new_task + "Periodic alert if the running kernel is broken in some serious way." + (fun __context -> Xapi_host.alert_if_kernel_broken ~__context + ) + ) ; Xapi_periodic_scheduler.add_to_queue "Period alert if TLS verification emergency disabled" (Xapi_periodic_scheduler.Periodic 600.) 600. (fun () -> diff --git a/ocaml/xenopsd/xc/domain.ml b/ocaml/xenopsd/xc/domain.ml index 7b31011aab..d33fc482e5 100644 --- a/ocaml/xenopsd/xc/domain.ml +++ b/ocaml/xenopsd/xc/domain.ml @@ -835,12 +835,12 @@ let create_channels ~xc uuid domid = let numa_hierarchy = let open Xenctrlext in let open Topology in - Lazy.from_fun (fun () -> - let xcext = get_handle () in - let distances = (numainfo xcext).distances in - let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in - NUMA.make ~distances ~cpu_to_node - ) + lazy + (let xcext = get_handle () in + let distances = (numainfo xcext).distances in + let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in + NUMA.make ~distances ~cpu_to_node + ) let numa_mutex = Mutex.create () diff --git a/quality-gate.sh b/quality-gate.sh index db8444b53e..b1d170041f 100755 --- a/quality-gate.sh +++ b/quality-gate.sh @@ -3,7 +3,7 @@ set -e list-hd () { - N=294 + N=293 LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc) if [ "$LIST_HD" -eq "$N" ]; then echo "OK counted $LIST_HD List.hd usages"