Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CA-394109: Reduce number of alerts #5696

Merged
merged 6 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions ocaml/idl/datamodel_cluster.ml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,16 @@ let pool_resync =
~params:[(Ref _cluster, "self", "The cluster to resync")]
~lifecycle ~allowed_roles:_R_POOL_OP ~errs:[] ()

(* Datamodel declaration for the internal Cluster.cstack_sync API call.
   It is declared ~hide_from_docs:true and ~pool_internal:true, so it is
   invisible in the public API reference and callable only by xapi itself.
   Per its ~doc text, the call runs on the coordinator, where the cluster
   watcher performs db updates and alert generation. *)
let cstack_sync =
call ~name:"cstack_sync"
~doc:
"Sync xapi db with the cluster stack synchronously, and generate alerts \
as needed. Only happens on the coordinator as this is where the cluster \
watcher performs updates."
~params:[(Ref _cluster, "self", "The cluster to sync")]
~hide_from_docs:true ~pool_internal:true ~lifecycle
~allowed_roles:_R_POOL_OP ~errs:[] ()

let t =
create_obj ~name:_cluster ~descr:"Cluster-wide Cluster metadata"
~doccomments:[] ~gen_constructor_destructor:false ~gen_events:true
Expand Down Expand Up @@ -245,5 +255,6 @@ let t =
; pool_force_destroy
; pool_destroy
; pool_resync
; cstack_sync
]
()
4 changes: 4 additions & 0 deletions ocaml/tests/test_cluster.ml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ let test_rpc ~__context call =
Rpc.{success= true; contents= Rpc.String ""; is_notification= false}
| "Cluster_host.get_cluster_config", _ ->
Rpc.{success= true; contents= Rpc.String ""; is_notification= false}
| "Cluster.cstack_sync", [_session; self] ->
let open API in
Xapi_cluster.cstack_sync ~__context ~self:(ref_Cluster_of_rpc self) ;
Rpc.{success= true; contents= Rpc.String ""; is_notification= false}
| name, params ->
Alcotest.failf "Unexpected RPC: %s(%s)" name
(String.concat " " (List.map Rpc.to_string params))
Expand Down
8 changes: 8 additions & 0 deletions ocaml/xapi/message_forwarding.ml
Original file line number Diff line number Diff line change
Expand Up @@ -6419,6 +6419,14 @@ functor
) ;
debug "Cluster.pool_resync for host %s" (Ref.string_of host)
)

(* Forward Cluster.cstack_sync to the pool coordinator: the call must run
   where the cluster watcher maintains the xapi db (see datamodel ~doc). *)
let cstack_sync ~__context ~self =
  info "Cluster.cstack_sync cluster %s" (Ref.string_of self) ;
  let coordinator = Helpers.get_master ~__context in
  let local_fn = Local.Cluster.cstack_sync ~self in
  do_op_on ~local_fn ~__context ~host:coordinator (fun session_id rpc ->
      Client.Cluster.cstack_sync ~rpc ~session_id ~self
  )
end

module Cluster_host = struct
Expand Down
9 changes: 8 additions & 1 deletion ocaml/xapi/xapi_cluster.ml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ let create ~__context ~pIF ~cluster_stack ~pool_auto_join ~token_timeout
~verify ;
(* Create the watcher here in addition to resync_host since pool_create
in resync_host only calls cluster_host.create for pool member nodes *)
create_cluster_watcher_on_master ~__context ~host ;
Watcher.create_as_necessary ~__context ~host ;
Xapi_cluster_host_helpers.update_allowed_operations ~__context
~self:cluster_host_ref ;
D.debug "Created Cluster: %s and Cluster_host: %s"
Expand Down Expand Up @@ -294,3 +294,10 @@ let pool_resync ~__context ~self:_ =
)
(* If host.clustering_enabled then resync_host should successfully
find or create a matching cluster_host which is also enabled *)

(* Implementation of the internal Cluster.cstack_sync call: when the
   cluster-health feature is enabled, replay the cluster stack's state into
   the xapi db via the watcher's update handler (which also raises alerts). *)
let cstack_sync ~__context ~self =
  match Xapi_cluster_helpers.cluster_health_enabled ~__context with
  | false ->
      ()
  | true ->
      debug "%s: sync db data with cluster stack" __FUNCTION__ ;
      Watcher.on_corosync_update ~__context ~cluster:self
        ["Updates due to cluster api calls"]
10 changes: 10 additions & 0 deletions ocaml/xapi/xapi_cluster.mli
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,13 @@ val pool_resync : __context:Context.t -> self:API.ref_Cluster -> unit
Cluster_host objects (ie., one for each host in the pool if the Cluster
has [pool_auto_join] set. If there is a failure, this function must return
an error that enables the administrator to fix the problem. *)

val cstack_sync : __context:Context.t -> self:API.ref_Cluster -> unit
(** [cstack_sync ~__context ~self] implements the internal XenAPI method that
    synchronously queries xapi-clusterd for diagnostics and updates the xapi
    db accordingly. It is called internally by cluster-host-create/destroy so
    that those API calls generate the correct alerts. Alerts caused by network
    failure (e.g. a host leaving because its network is down) are instead
    raised by the cluster watcher. This call only runs on the coordinator,
    since that is where the cluster watcher performs its updates; the two
    share the same update code. *)
8 changes: 4 additions & 4 deletions ocaml/xapi/xapi_cluster_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ let corosync3_enabled ~__context =
let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in
List.assoc_opt "restrict_corosync3" restrictions = Some "false"

let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
let maybe_generate_alert ~__context ~num_hosts ~hosts_left ~hosts_joined ~quorum
=
let generate_alert join cluster_host =
let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in
Expand Down Expand Up @@ -148,10 +148,10 @@ let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
)
in
if cluster_health_enabled ~__context then (
List.iter (generate_alert false) missing_hosts ;
List.iter (generate_alert true) new_hosts ;
List.iter (generate_alert false) hosts_left ;
List.iter (generate_alert true) hosts_joined ;
(* only generate this alert when the number of hosts is decreasing *)
if missing_hosts <> [] && num_hosts <= quorum then
if hosts_left <> [] && num_hosts <= quorum then
let pool = Helpers.get_pool ~__context in
let pool_uuid = Db.Pool.get_uuid ~__context ~self:pool in
let name, priority = Api_messages.cluster_quorum_approaching_lost in
Expand Down
33 changes: 12 additions & 21 deletions ocaml/xapi/xapi_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
*)

open Xapi_clustering
open Xapi_cluster_helpers
open Ipaddr_rpc_type

module D = Debug.Make (struct let name = "xapi_cluster_host" end)
Expand Down Expand Up @@ -55,20 +54,6 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body
raise err
)

let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts =
let num_hosts = Db.Cluster_host.get_all ~__context |> List.length in
let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in
maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum

let alert_for_cluster_host_leave ~__context ~cluster_host =
alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[cluster_host]
~new_hosts:[]

let alert_for_cluster_host_join ~__context ~cluster_host =
alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[]
~new_hosts:[cluster_host]

(* Create xapi db object for cluster_host, resync_host calls clusterd *)
let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
with_clustering_lock __LOC__ (fun () ->
Expand All @@ -81,7 +66,6 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
~enabled:false ~current_operations:[] ~allowed_operations:[]
~other_config:[] ~joined:false ~live:false
~last_update_live:API.Date.epoch ;
alert_for_cluster_host_join ~__context ~cluster_host:ref ;
ref
)

Expand Down Expand Up @@ -232,7 +216,7 @@ let resync_host ~__context ~host =
(* If we have just joined, enable will prevent concurrent clustering ops *)
if not (Db.Cluster_host.get_joined ~__context ~self) then (
join_internal ~__context ~self ;
create_cluster_watcher_on_master ~__context ~host ;
Watcher.create_as_necessary ~__context ~host ;
Xapi_observer.initialise_observer ~__context
Xapi_observer_components.Xapi_clusterd
) else if Db.Cluster_host.get_enabled ~__context ~self then (
Expand Down Expand Up @@ -269,16 +253,21 @@ let destroy_op ~__context ~self ~force =
(Cluster_client.LocalClient.leave, "destroy")
in
let result = local_fn (rpc ~__context) dbg in
let cluster = Db.Cluster_host.get_cluster ~__context ~self in
match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
| Ok () ->
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Helpers.call_api_functions ~__context (fun rpc session_id ->
Client.Client.Cluster.cstack_sync ~rpc ~session_id ~self:cluster
) ;
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host.%s was successful" fn_str ;
Xapi_clustering.Daemon.disable ~__context
| Error error ->
warn "Error occurred during Cluster_host.%s" fn_str ;
if force then (
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Helpers.call_api_functions ~__context (fun rpc session_id ->
Client.Client.Cluster.cstack_sync ~rpc ~session_id ~self:cluster
) ;
let ref_str = Ref.string_of self in
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host %s force destroyed." ref_str
Expand Down Expand Up @@ -326,7 +315,9 @@ let forget ~__context ~self =
Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ;
(* must not disable the daemon here, because we declared another unreachable node dead,
* not the current one *)
alert_for_cluster_host_leave ~__context ~cluster_host:self ;
Helpers.call_api_functions ~__context (fun rpc session_id ->
Client.Client.Cluster.cstack_sync ~rpc ~session_id ~self:cluster
) ;
Db.Cluster_host.destroy ~__context ~self ;
debug "Cluster_host.forget was successful"
| Error error ->
Expand Down Expand Up @@ -375,7 +366,7 @@ let enable ~__context ~self =
"Cluster_host.enable: xapi-clusterd not running - attempting to start" ;
Xapi_clustering.Daemon.enable ~__context
) ;
create_cluster_watcher_on_master ~__context ~host ;
Watcher.create_as_necessary ~__context ~host ;
Xapi_observer.initialise_observer ~__context
Xapi_observer_components.Xapi_clusterd ;
let verify = Stunnel_client.get_verify_by_default () in
Expand Down
Loading
Loading