Skip to content

Commit

Permalink
CA-401242: avoid long-running, idle connections on VDI.pool_migrate (#…
Browse files Browse the repository at this point in the history
…6102)

When a VDI.pool_migrate starts at a pool member, a connection between
the
coordinator and that host remains open for the duration of the
migration. This
connection is completely idle. If it's open for 12 hours, stunnel closes
the
connection due to inactivity, which cancels the migration.

To avoid this use an internal API that uses short-running connection
whenever
possible to avoid interrupting the migration.

Also change nested if-else flow in vdi handling to use Result with let
binds, and add missing CDI operations to storage operations.

I've manually verified the fix by changing the stunnel timeout in the
coordinator of a pool to 5 seconds and migrating a vdi used by a VM in
another host:
```
# xe vdi-pool-migrate uuid=fdb8783f-8cdb-44d3-9327-992f047e6262 sr-uuid=c879ffc9-bb14-fec5-3d95-8959db9337eb
270c3dc8-27bc-4ef2-982a-a6142f0ffce1
```

And after reverting the patch:
```
# xe vdi-pool-migrate uuid=270c3dc8-27bc-4ef2-982a-a6142f0ffce1 sr-uuid=aade9dc6-ff31-3204-4187-37f82ad06c77
Cannot forward messages because the server cannot be contacted. The server may be switched off or there may be network connectivity problems.
host: 0a3a45a0-8754-4570-bfde-6ef6843ccda1 (xs2)
```

The changes are best reviewed one by one, and while ignoring whitespace,
if using github.
  • Loading branch information
psafont authored Nov 5, 2024
2 parents 8e89449 + e40b3fc commit 3a776c0
Show file tree
Hide file tree
Showing 9 changed files with 472 additions and 447 deletions.
7 changes: 7 additions & 0 deletions ocaml/idl/datamodel.ml
Original file line number Diff line number Diff line change
Expand Up @@ -4181,6 +4181,13 @@ module SR = struct
, "Exporting a bitmap that shows the changed blocks between two VDIs"
)
; ("vdi_set_on_boot", "Setting the on_boot field of the VDI")
; ("vdi_blocked", "Blocking other operations for a VDI")
; ("vdi_copy", "Copying the VDI")
; ("vdi_force_unlock", "Forcefully unlocking the VDI")
; ("vdi_forget", "Forgetting about the VDI")
; ("vdi_generate_config", "Generating the configuration of the VDI")
; ("vdi_resize_online", "Resizing the VDI online")
; ("vdi_update", "Refreshing the fields on the VDI")
; ("pbd_create", "Creating a PBD for this SR")
; ("pbd_destroy", "Destroying one of this SR's PBDs")
]
Expand Down
2 changes: 1 addition & 1 deletion ocaml/idl/datamodel_common.ml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ open Datamodel_roles
to leave a gap for potential hotfixes needing to increment the schema version.*)
let schema_major_vsn = 5

let schema_minor_vsn = 783
let schema_minor_vsn = 784

(* Historical schema versions just in case this is useful later *)
let rio_schema_major_vsn = 5
Expand Down
2 changes: 1 addition & 1 deletion ocaml/idl/schematest.ml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ let hash x = Digest.string x |> Digest.to_hex
(* BEWARE: if this changes, check that schema has been bumped accordingly in
ocaml/idl/datamodel_common.ml, usually schema_minor_vsn *)

let last_known_schema_hash = "8fcd8892ec0c7d130b0da44c5fd3990b"
let last_known_schema_hash = "b427bac09aca4eabc9407738a9155326"

let current_schema_hash : string =
let open Datamodel_types in
Expand Down
15 changes: 15 additions & 0 deletions ocaml/tests/record_util/old_record_util.ml
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,21 @@ let sr_operation_to_string : API.storage_operations -> string = function
"PBD.create"
| `pbd_destroy ->
"PBD.destroy"
(* The following ones were added after the file got introduced *)
| `vdi_blocked ->
"VDI.blocked"
| `vdi_copy ->
"VDI.copy"
| `vdi_force_unlock ->
"VDI.force_unlock"
| `vdi_forget ->
"VDI.forget"
| `vdi_generate_config ->
"VDI.generate_config"
| `vdi_resize_online ->
"VDI.resize_online"
| `vdi_update ->
"VDI.update"

let vbd_operation_to_string = function
| `attach ->
Expand Down
79 changes: 39 additions & 40 deletions ocaml/tests/test_vdi_allowed_operations.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ let setup_test ~__context ?sm_fun ?vdi_fun () =
(vdi_ref, vdi_record)

let check_same_error_code =
let open Alcotest in
let open Alcotest_comparators in
check (option error_code) "Same error code"
Alcotest.(check @@ result unit Alcotest_comparators.error_code)
"Same error code"

let run_assert_equal_with_vdi ~__context ?(ha_enabled = false) ?sm_fun ?vdi_fun
op exc =
Expand All @@ -52,7 +51,7 @@ let test_ca98944 () =
()
)
`update
(Some (Api_errors.vdi_in_use, [])) ;
(Error (Api_errors.vdi_in_use, [])) ;
(* Should raise vdi_in_use *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
Expand All @@ -61,7 +60,7 @@ let test_ca98944 () =
()
)
`update
(Some (Api_errors.vdi_in_use, [])) ;
(Error (Api_errors.vdi_in_use, [])) ;
(* Should raise vdi_in_use *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
Expand All @@ -70,7 +69,7 @@ let test_ca98944 () =
()
)
`update
(Some (Api_errors.vdi_in_use, [])) ;
(Error (Api_errors.vdi_in_use, [])) ;
(* Should raise other_operation_in_progress *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
Expand All @@ -79,14 +78,14 @@ let test_ca98944 () =
()
)
`update
(Some (Api_errors.other_operation_in_progress, [])) ;
(Error (Api_errors.other_operation_in_progress, [])) ;
(* Should pass *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
make_vbd ~vDI:vdi_ref ~__context ~reserved:false ~currently_attached:false
~current_operations:[] ()
)
`forget None
`forget (Ok ())

(* VDI.copy should be allowed if all attached VBDs are read-only. *)
let test_ca101669 () =
Expand All @@ -97,15 +96,15 @@ let test_ca101669 () =
make_vbd ~__context ~vDI:vdi_ref ~currently_attached:true ~mode:`RW ()
)
`copy
(Some (Api_errors.vdi_in_use, [])) ;
(Error (Api_errors.vdi_in_use, [])) ;
(* Attempting to copy a RO-attached VDI should pass. *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
make_vbd ~__context ~vDI:vdi_ref ~currently_attached:true ~mode:`RO ()
)
`copy None ;
`copy (Ok ()) ;
(* Attempting to copy an unattached VDI should pass. *)
run_assert_equal_with_vdi ~__context `copy None ;
run_assert_equal_with_vdi ~__context `copy (Ok ()) ;
(* Attempting to copy RW- and RO-attached VDIs should fail with VDI_IN_USE. *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
Expand All @@ -115,7 +114,7 @@ let test_ca101669 () =
make_vbd ~__context ~vDI:vdi_ref ~currently_attached:true ~mode:`RO ()
)
`copy
(Some (Api_errors.vdi_in_use, []))
(Error (Api_errors.vdi_in_use, []))

let test_ca125187 () =
let __context = Test_common.make_test_database () in
Expand All @@ -128,7 +127,7 @@ let test_ca125187 () =
Db.VDI.set_current_operations ~__context ~self:vdi_ref
~value:[("mytask", `copy)]
)
`copy None ;
`copy (Ok ()) ;
(* A VBD can be plugged to a VDI which is being copied. This is required as
* the VBD is plugged after the VDI is marked with the copy operation. *)
let _, _ =
Expand Down Expand Up @@ -162,7 +161,7 @@ let test_ca126097 () =
Db.VDI.set_current_operations ~__context ~self:vdi_ref
~value:[("mytask", `copy)]
)
`clone None ;
`clone (Ok ()) ;
(* Attempting to snapshot a VDI being copied should be allowed. *)
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi_ref ->
Expand All @@ -173,7 +172,7 @@ let test_ca126097 () =
~value:[("mytask", `copy)]
)
`snapshot
(Some (Api_errors.operation_not_allowed, []))
(Error (Api_errors.operation_not_allowed, []))

(** Tests for the checks related to changed block tracking *)
let test_cbt =
Expand All @@ -189,7 +188,7 @@ let test_cbt =
Db.SM.remove_from_features ~__context ~self:sm ~key:"VDI_CONFIG_CBT"
)
op
(Some (Api_errors.sr_operation_not_supported, []))
(Error (Api_errors.sr_operation_not_supported, []))
in
let test_sm_feature_check =
for_vdi_operations all_cbt_operations test_sm_feature_check
Expand All @@ -202,7 +201,7 @@ let test_cbt =
Db.VDI.set_is_a_snapshot ~__context ~self:vdi ~value:true
)
op
(Some (Api_errors.operation_not_allowed, []))
(Error (Api_errors.operation_not_allowed, []))
)
in
let test_cbt_enable_disable_vdi_type_check =
Expand All @@ -213,21 +212,21 @@ let test_cbt =
Db.VDI.set_type ~__context ~self:vdi ~value:`metadata
)
op
(Some (Api_errors.vdi_incompatible_type, [])) ;
(Error (Api_errors.vdi_incompatible_type, [])) ;
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi ->
Db.VDI.set_type ~__context ~self:vdi ~value:`redo_log
)
op
(Some (Api_errors.vdi_incompatible_type, [])) ;
(Error (Api_errors.vdi_incompatible_type, [])) ;
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi -> Db.VDI.set_type ~__context ~self:vdi ~value:`user)
op None ;
op (Ok ()) ;
run_assert_equal_with_vdi ~__context
~vdi_fun:(fun vdi ->
Db.VDI.set_type ~__context ~self:vdi ~value:`system
)
op None
op (Ok ())
)
in
let test_cbt_enable_disable_not_allowed_for_reset_on_boot =
Expand All @@ -238,7 +237,7 @@ let test_cbt =
Db.VDI.set_on_boot ~__context ~self:vdi ~value:`reset
)
op
(Some (Api_errors.vdi_on_boot_mode_incompatible_with_operation, []))
(Error (Api_errors.vdi_on_boot_mode_incompatible_with_operation, []))
)
in
let test_cbt_enable_disable_can_be_performed_live =
Expand All @@ -249,7 +248,7 @@ let test_cbt =
Test_common.make_vbd ~__context ~vDI:vdi ~currently_attached:true
~mode:`RW ()
)
op None
op (Ok ())
)
in
let test_cbt_metadata_vdi_type_check =
Expand All @@ -273,7 +272,7 @@ let test_cbt =
Db.VDI.set_type ~__context ~self:vdi ~value:`cbt_metadata
)
op
(Some (Api_errors.vdi_incompatible_type, []))
(Error (Api_errors.vdi_incompatible_type, []))
)
in
let test_vdi_cbt_enabled_check =
Expand All @@ -288,7 +287,7 @@ let test_cbt =
Db.VDI.set_cbt_enabled ~__context ~self:vdi ~value:true
)
op
(Some (Api_errors.vdi_cbt_enabled, []))
(Error (Api_errors.vdi_cbt_enabled, []))
)
in
let test_vdi_data_destroy () =
Expand All @@ -308,31 +307,31 @@ let test_cbt =
)
(* ensure VDI.data_destroy works before introducing errors *)
[
((fun vdi -> pass_data_destroy vdi), None)
((fun vdi -> pass_data_destroy vdi), Ok ())
; ( (fun vdi ->
pass_data_destroy vdi ;
Db.VDI.set_is_a_snapshot ~__context ~self:vdi ~value:false
)
, Some (Api_errors.operation_not_allowed, [])
, Error (Api_errors.operation_not_allowed, [])
)
; ( (fun vdi ->
pass_data_destroy vdi ;
let sr = Db.VDI.get_SR ~__context ~self:vdi in
Db.SR.set_is_tools_sr ~__context ~self:sr ~value:true
)
, Some (Api_errors.sr_operation_not_supported, [])
, Error (Api_errors.sr_operation_not_supported, [])
)
; ( (fun vdi ->
pass_data_destroy vdi ;
Db.VDI.set_cbt_enabled ~__context ~self:vdi ~value:false
)
, Some (Api_errors.vdi_no_cbt_metadata, [])
, Error (Api_errors.vdi_no_cbt_metadata, [])
)
; ( (fun vdi ->
pass_data_destroy vdi ;
Db.VDI.set_type ~__context ~self:vdi ~value:`cbt_metadata
)
, None
, Ok ()
)
; (* VDI.data_destroy should wait a bit for the VDIs to be unplugged and
destroyed, instead of failing immediately in check_operation_error,
Expand All @@ -346,7 +345,7 @@ let test_cbt =
in
pass_data_destroy vdi
)
, None
, Ok ()
)
; ( (fun vdi ->
(* Set up the fields corresponding to a VM snapshot *)
Expand All @@ -359,7 +358,7 @@ let test_cbt =
in
pass_data_destroy vdi
)
, None
, Ok ()
)
; ( (fun vdi ->
let vM = Test_common.make_vm ~__context () in
Expand All @@ -369,7 +368,7 @@ let test_cbt =
in
pass_data_destroy vdi
)
, None
, Ok ()
)
]
in
Expand All @@ -389,7 +388,7 @@ let test_cbt =
Db.VDI.set_cbt_enabled ~__context ~self:vDI ~value:true ;
Db.VDI.set_is_a_snapshot ~__context ~self:vDI ~value:true
)
, None
, Ok ()
)
in
List.iter
Expand All @@ -407,17 +406,17 @@ let test_cbt =
in
()
)
, Some (Api_errors.vdi_in_use, [])
, Error (Api_errors.vdi_in_use, [])
)
; (* positive test checks no errors thrown for cbt_metadata or cbt_enabled VDIs *)
( (fun vDI ->
Db.VDI.set_cbt_enabled ~__context ~self:vDI ~value:true ;
Db.VDI.set_type ~__context ~self:vDI ~value:`cbt_metadata
)
, None
, Ok ()
)
; ( (fun vDI -> Db.VDI.set_cbt_enabled ~__context ~self:vDI ~value:true)
, None
, Ok ()
)
; test_cbt_enabled_snapshot_vdi_linked_to_vm_snapshot
~vbd_currently_attached:false
Expand Down Expand Up @@ -467,14 +466,14 @@ let test_operations_restricted_during_rpu =
Db.SM.set_features ~__context ~self:sm ~value:[("VDI_MIRROR", 1L)]
)
`mirror
(Some (Api_errors.not_supported_during_upgrade, [])) ;
(Error (Api_errors.not_supported_during_upgrade, [])) ;
Db.Pool.remove_from_other_config ~__context ~self:pool
~key:Xapi_globs.rolling_upgrade_in_progress ;
run_assert_equal_with_vdi ~__context
~sm_fun:(fun sm ->
Db.SM.set_features ~__context ~self:sm ~value:[("VDI_MIRROR", 1L)]
)
`mirror None
`mirror (Ok ())
in
let test_update_allowed_operations () =
let __context = Mock.make_context_with_new_db "Mock context" in
Expand Down Expand Up @@ -523,7 +522,7 @@ let test_null_vm =
()
in
(* This shouldn't throw an exception *)
let (_ : _ option) =
let (_ : _ result) =
Xapi_vdi.check_operation_error ~__context false vdi_record vdi_ref op
in
()
Expand Down
14 changes: 14 additions & 0 deletions ocaml/xapi-cli-server/record_util.ml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,20 @@ let sr_operation_to_string : API.storage_operations -> string = function
"VDI.data_destroy"
| `vdi_list_changed_blocks ->
"VDI.list_changed_blocks"
| `vdi_blocked ->
"VDI.blocked"
| `vdi_copy ->
"VDI.copy"
| `vdi_force_unlock ->
"VDI.force_unlock"
| `vdi_forget ->
"VDI.forget"
| `vdi_generate_config ->
"VDI.generate_config"
| `vdi_resize_online ->
"VDI.resize_online"
| `vdi_update ->
"VDI.update"
| `pbd_create ->
"PBD.create"
| `pbd_destroy ->
Expand Down
Loading

0 comments on commit 3a776c0

Please sign in to comment.