Skip to content

Commit

Permalink
feat: DEVOPS-1801 introduce persistence node (#2083)
Browse files Browse the repository at this point in the history
  • Loading branch information
frankmeds authored Dec 28, 2024
1 parent eac2492 commit a8bed8b
Show file tree
Hide file tree
Showing 15 changed files with 172 additions and 75 deletions.
3 changes: 2 additions & 1 deletion infra/tf/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ resource "google_storage_bucket_iam_binding" "persistence_bucket_admins" {
"serviceAccount:${module.bootstraps.service_account.email}",
"serviceAccount:${module.validators.service_account.email}",
"serviceAccount:${module.apis.service_account.email}",
"serviceAccount:${module.checkpoints.service_account.email}"
"serviceAccount:${module.checkpoints.service_account.email}",
"serviceAccount:${module.persistences.service_account.email}"
]
}

Expand Down
17 changes: 9 additions & 8 deletions infra/tf/modules/node/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ variable "node_role_mappings" {
description = "(Optional) The node role short names"
type = map(string)
default = {
apps = "app",
api = "api",
bootstrap = "boo",
validator = "val",
checkpoint = "che",
sentry = "sen",
apps = "app",
api = "api",
bootstrap = "boo",
validator = "val",
checkpoint = "che",
persistence = "per",
sentry = "sen",
}
}

Expand Down Expand Up @@ -69,8 +70,8 @@ variable "role" {
description = "VM role"
type = string
validation {
condition = contains(["bootstrap", "api", "validator", "apps", "checkpoint", "sentry"], var.role)
error_message = "The role value must be one of: 'bootstrap', 'api', 'validator', 'apps', 'checkpoint', 'sentry'."
condition = contains(["bootstrap", "api", "validator", "apps", "checkpoint", "persistence", "sentry"], var.role)
error_message = "The role value must be one of: 'bootstrap', 'api', 'validator', 'apps', 'checkpoint', 'persistence', 'sentry'."
}
}

Expand Down
36 changes: 36 additions & 0 deletions infra/tf/persistence.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
################################################################################
# ZQ2 GCP Terraform persistence resources
################################################################################

# Persistence nodes: an instance of the shared node module with the "persistence" role.
module "persistences" {
source = "./modules/node"

# Node configuration is taken from the root-level `persistence` variable
config = var.persistence
chain_name = var.chain_name

role = "persistence"
labels = local.labels
# No network tags are attached to these instances
network_tags = []

metadata = {
# Base64 encoding of the empty string
subdomain = base64encode("")
}

node_dns_subdomain = var.node_dns_subdomain
node_dns_zone_project_id = var.node_dns_zone_project_id

service_account_iam = local.default_service_account_iam
}

# One instance group per zone reported by the persistences module.
resource "google_compute_instance_group" "persistence" {
for_each = toset(module.persistences.zones)

name = "${var.chain_name}-persistence-${each.key}"
zone = each.key
# Only the persistence instances whose zone matches this group's zone
instances = [for instance in module.persistences.instances : instance.self_link if instance.zone == each.key]

# Exposes port 4201 under the name "jsonrpc"
named_port {
name = "jsonrpc"
port = "4201"
}
}
37 changes: 37 additions & 0 deletions infra/tf/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,43 @@ variable "checkpoint" {
}
}

# Configuration object for the persistence nodes. Every attribute except
# `nodes` has a default; each entry in `nodes` needs a `count` and exactly one
# of `region` or `zone` (enforced by the second validation below).
variable "persistence" {
description = "(Optional) The configuration of the persistence nodes"
type = object({
disk_size = optional(number, 256)
instance_type = optional(string, "e2-standard-2")
provisioning_model = optional(string, "STANDARD")
generate_external_ip = optional(bool, false)
nodes = list(object({
count = number
region = optional(string)
zone = optional(string)
}))
})
# Default: a single node placed by region (asia-southeast1)
default = {
nodes : [
{
count = 1
region = "asia-southeast1"
}
]
}

# Validation for provisioning_model
validation {
condition = contains(["STANDARD", "SPOT"], var.persistence.provisioning_model)
error_message = "Provisioning model must be one of 'STANDARD' or 'SPOT'."
}

# Validation that every node specifies exactly one of 'region' or 'zone'
# (setting both, or neither, is rejected)
validation {
condition = alltrue([
for node in var.persistence.nodes : (node.region != null && node.zone == null) || (node.region == null && node.zone != null)
])
error_message = "You need to specify either 'region' or 'zone' for a node."
}
}

variable "node_dns_subdomain" {
description = "Nodes DNS zone name"
type = string
Expand Down
26 changes: 14 additions & 12 deletions z2/docs/deployer.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@ Options:
Virtual Machine roles

Possible values:
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- sentry: Virtual machine sentry
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- persistence: Virtual machine persistence
- sentry: Virtual machine sentry

-v, --verbose...
Increase logging verbosity
Expand Down Expand Up @@ -421,12 +422,13 @@ Options:
Node role. Default: validator
Possible values:
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- sentry: Virtual machine sentry
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- persistence: Virtual machine persistence
- sentry: Virtual machine sentry
-v, --verbose...
Increase logging verbosity
Expand Down
5 changes: 4 additions & 1 deletion z2/resources/node_provision.tera.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,9 @@ def go(role):
install_docker()
install_ops_agent()
install_gcloud()
login_registry()
match role:
case "bootstrap" | "checkpoint":
case "bootstrap" | "checkpoint" | "persistence":
log("Configuring a validator node")
configure_logrotate()
pull_zq2_image()
Expand Down Expand Up @@ -555,6 +556,8 @@ def install_gcloud():
run_or_die(sudo_noninteractive_apt_env(["apt", "update"]))
run_or_die(sudo_noninteractive_apt_env(["sudo","apt", "install", "-y", "google-cloud-cli" ]))

def login_registry():
    """Log Docker in to https://asia-docker.pkg.dev using a gcloud access token.

    Builds a shell pipeline that pipes the output of
    ``gcloud auth print-access-token`` into ``docker login`` (user
    ``oauth2accesstoken``, password read from stdin) and hands it to
    ``run_or_die`` to be executed through ``sudo bash -c``.
    """
    # Kept as one string because the pipe has to be interpreted by bash.
    login_pipeline = "gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin https://asia-docker.pkg.dev"
    run_or_die(["sudo", "bash", "-c", login_pipeline])

def create_zq2_start_script():
with open("/tmp/zq2.sh", "w") as f:
Expand Down
65 changes: 31 additions & 34 deletions z2/resources/persistence_export.tera.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#!/bin/bash

# Directory where checkpoint files are located
CHECKPOINT_DIR="/data/{{ eth_chain_id }}/checkpoints"

# GCS bucket where the persistence will be exported
GCS_BUCKET="gs://{{ network_name }}-persistence"

Expand All @@ -24,34 +21,34 @@ is_dir_empty() {
# Start the persistence export process
start_time=$(date +%s)

if is_dir_empty "$CHECKPOINT_DIR"; then
# Stop zilliqa service
if ! sudo systemctl stop zilliqa.service; then
log_message "Error: Failed to stop zilliqa service"
exit 1
fi
# Create persistence export folder name with timestamp
persistence_export_name="{{ network_name }}-$(date +%Y%m%d%H%M%S)-persistence"
log_message "Creating persistence export: $persistence_export_name"
# Upload to GCS
if ! gsutil -m cp -r /data "$GCS_BUCKET/$persistence_export_name/"; then
log_message "Error: Failed to upload data to GCS"
sudo systemctl start zilliqa.service
exit 1
fi
# Start zilliqa service
if ! sudo systemctl start zilliqa.service; then
log_message "Error: Failed to start zilliqa service"
exit 1
fi
# Calculate and log total execution time
end_time=$(date +%s)
duration=$((end_time - start_time))
log_message "Persistence export completed successfully in $duration seconds"
else
log_message "Checkpoint files present in $CHECKPOINT_DIR. Skipping persistence export."
fi
# Stop zilliqa service before copying /data
if ! sudo systemctl stop zilliqa.service; then
    log_message "Error: Failed to stop zilliqa service"
    exit 1
fi

# Create persistence export folder name with timestamp.
# A single `date` call produces both the date and the time part, so the two
# always come from the same instant (two separate calls can straddle midnight
# and yield a timestamp that never existed). The resulting format is unchanged:
# <chain id>-persistence-YYYYMMDD-HHMMSS
persistence_export_name="{{ eth_chain_id }}-persistence-$(date +%Y%m%d-%H%M%S)"
log_message "Creating persistence export: $persistence_export_name"

# Upload to GCS
if ! gsutil -m cp -r /data "$GCS_BUCKET/$persistence_export_name/"; then
    log_message "Error: Failed to upload data to GCS"
    # Bring the node back up even though the export failed
    sudo systemctl start zilliqa.service
    exit 1
fi

# Start zilliqa service
if ! sudo systemctl start zilliqa.service; then
    log_message "Error: Failed to start zilliqa service"
    exit 1
fi

# Calculate and log total execution time
end_time=$(date +%s)
duration=$((end_time - start_time))
log_message "Persistence export completed successfully in $duration seconds"

# Keep only the most recent $RETAIN_EXPORTS persistence exports in the GCS bucket.
# Export names embed a sortable timestamp, so a reverse sort lists the newest
# first; `tail -n +N` then selects everything from line N onwards (i.e. all
# exports older than the newest $RETAIN_EXPORTS) for deletion.
RETAIN_EXPORTS=100
gsutil ls -d "$GCS_BUCKET/{{ eth_chain_id }}-persistence-*/" | sort -r | tail -n +$((RETAIN_EXPORTS + 1)) | awk '{print $1}' | xargs -I {} gsutil rm -rfa {}
log_message "Cleanup completed"
51 changes: 32 additions & 19 deletions z2/src/chain/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ pub enum NodeRole {
Apps,
/// Virtual machine checkpoint
Checkpoint,
/// Virtual machine persistence
Persistence,
/// Virtual machine sentry
Sentry,
}
Expand All @@ -108,6 +110,7 @@ impl FromStr for NodeRole {
"apps" => Ok(NodeRole::Apps),
"validator" => Ok(NodeRole::Validator),
"checkpoint" => Ok(NodeRole::Checkpoint),
"persistence" => Ok(NodeRole::Persistence),
"sentry" => Ok(NodeRole::Sentry),
_ => Err(anyhow!("Node role not supported")),
}
Expand All @@ -122,6 +125,7 @@ impl fmt::Display for NodeRole {
NodeRole::Apps => write!(f, "apps"),
NodeRole::Validator => write!(f, "validator"),
NodeRole::Checkpoint => write!(f, "checkpoint"),
NodeRole::Persistence => write!(f, "persistence"),
NodeRole::Sentry => write!(f, "sentry"),
}
}
Expand Down Expand Up @@ -551,7 +555,9 @@ impl ChainNode {
self.machine
.copy(&[checkpoint_cron_job], "/tmp/checkpoint_cron_job.sh")
.await?;
}

if self.role == NodeRole::Persistence {
let temp_persistence_export_cron_job = NamedTempFile::new()?;
let persistence_export_cron_job = &self
.create_persistence_export_cron_job(
Expand Down Expand Up @@ -581,21 +587,19 @@ impl ChainNode {
}

if self.role == NodeRole::Checkpoint {
let cmd_checkpoint_cron_job = "sudo rm -f /tmp/checkpoint_cron_job.sh";
let output_cmd_checkpoint_cron_job =
self.machine.run(cmd_checkpoint_cron_job, true).await?;
if !output_cmd_checkpoint_cron_job.success {
println!("{:?}", output_cmd_checkpoint_cron_job.stderr);
let cmd = "sudo rm -f /tmp/checkpoint_cron_job.sh";
let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!("Error removing previous checkpoint cron job"));
}
}

let cmd_persistence_export_cron_job = "sudo rm -f /tmp/persistence_export_cron_job.sh";
let output_persistence_export_cron_job = self
.machine
.run(cmd_persistence_export_cron_job, true)
.await?;
if !output_persistence_export_cron_job.success {
println!("{:?}", output_persistence_export_cron_job.stderr);
if self.role == NodeRole::Persistence {
let cmd = "sudo rm -f /tmp/persistence_export_cron_job.sh";
let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!(
"Error removing previous persistence export cron job"
));
Expand All @@ -619,14 +623,25 @@ impl ChainNode {
let cmd = r#"
sudo chmod 777 /tmp/checkpoint_cron_job.sh && \
sudo mv /tmp/checkpoint_cron_job.sh /checkpoint_cron_job.sh && \
echo '*/30 * * * * /checkpoint_cron_job.sh' | sudo crontab -"#;

let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!("Error creating the checkpoint cronjob"));
}
}

if self.role == NodeRole::Persistence {
let cmd = r#"
sudo chmod 777 /tmp/persistence_export_cron_job.sh && \
sudo mv /tmp/persistence_export_cron_job.sh /persistence_export_cron_job.sh && \
(echo '*/5 * * * * /checkpoint_cron_job.sh'; echo '0 */6 * * * /persistence_export_cron_job.sh') | sudo crontab -"#;
echo '0 */2 * * * /persistence_export_cron_job.sh' | sudo crontab -"#;

let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!("Error setting up cron jobs"));
return Err(anyhow!("Error creating the persistence export cronjob"));
}
}

Expand Down Expand Up @@ -942,7 +957,7 @@ impl ChainNode {
// export the backup files
progress_bar.start("Exporting the backup files");
let command = format!(
"sudo gsutil -m cp -r /data gs://{}-persistence/{}",
"sudo gsutil -m cp -r /data gs://{}-persistence/{}/",
self.chain()?,
backup_name
);
Expand Down Expand Up @@ -1016,15 +1031,13 @@ impl ChainNode {
} else {
// delete the data folder
progress_bar.start(format!("{}: Deleting the data folder", self.name()));
machine
.run("sudo rm -rf /data && sudo mkdir -p /data", false)
.await?;
machine.run("sudo rm -rf /data", false).await?;
progress_bar.inc(1);

// import the backup files
progress_bar.start(format!("{}: Importing the backup files", self.name()));
let command = format!(
"sudo gsutil -m cp -r gs://{}-persistence/{}/* /data",
"sudo gsutil -m cp -r gs://{}-persistence/{}/* /",
self.chain()?,
backup_name
);
Expand Down
1 change: 1 addition & 0 deletions zq2-devnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ roles:
- api
- apps
- checkpoint
- persistence
versions:
zq2: v0.4.0
otterscan: develop
Expand Down
1 change: 1 addition & 0 deletions zq2-infratest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ roles:
- api
- apps
- checkpoint
- persistence
versions:
zq2: 30a24610
otterscan: develop
Expand Down
Loading

0 comments on commit a8bed8b

Please sign in to comment.