Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: DEVOPS-1801 introduce persistence node #2083

Merged
merged 1 commit into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion infra/tf/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ resource "google_storage_bucket_iam_binding" "persistence_bucket_admins" {
"serviceAccount:${module.bootstraps.service_account.email}",
"serviceAccount:${module.validators.service_account.email}",
"serviceAccount:${module.apis.service_account.email}",
"serviceAccount:${module.checkpoints.service_account.email}"
"serviceAccount:${module.checkpoints.service_account.email}",
"serviceAccount:${module.persistences.service_account.email}"
]
}

Expand Down
17 changes: 9 additions & 8 deletions infra/tf/modules/node/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ variable "node_role_mappings" {
description = "(Optional) The node role short names"
type = map(string)
default = {
apps = "app",
api = "api",
bootstrap = "boo",
validator = "val",
checkpoint = "che",
sentry = "sen",
apps = "app",
api = "api",
bootstrap = "boo",
validator = "val",
checkpoint = "che",
persistence = "per",
sentry = "sen",
}
}

Expand Down Expand Up @@ -69,8 +70,8 @@ variable "role" {
description = "VM role"
type = string
validation {
condition = contains(["bootstrap", "api", "validator", "apps", "checkpoint", "sentry"], var.role)
error_message = "The role value must be one of: 'bootstrap', 'api', 'validator', 'apps', 'checkpoint', 'sentry'."
condition = contains(["bootstrap", "api", "validator", "apps", "checkpoint", "persistence", "sentry"], var.role)
error_message = "The role value must be one of: 'bootstrap', 'api', 'validator', 'apps', 'checkpoint', 'persistence', 'sentry'."
}
}

Expand Down
36 changes: 36 additions & 0 deletions infra/tf/persistence.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
################################################################################
# ZQ2 GCP Terraform persistence resources
################################################################################

# Persistence node pool: instantiates the shared ./modules/node module with
# role "persistence", configured from var.persistence.
module "persistences" {
source = "./modules/node"

config = var.persistence
chain_name = var.chain_name

role = "persistence"
labels = local.labels
# No network tags are attached to the persistence instances.
network_tags = []

metadata = {
# Base64 encoding of the empty string, i.e. no subdomain value is supplied.
subdomain = base64encode("")
}

node_dns_subdomain = var.node_dns_subdomain
node_dns_zone_project_id = var.node_dns_zone_project_id

service_account_iam = local.default_service_account_iam
}

# One instance group per zone reported by the persistences module; each group
# is named "<chain_name>-persistence-<zone>".
resource "google_compute_instance_group" "persistence" {
for_each = toset(module.persistences.zones)

name = "${var.chain_name}-persistence-${each.key}"
zone = each.key
# Only the persistence instances whose zone matches this group's zone are included.
instances = [for instance in module.persistences.instances : instance.self_link if instance.zone == each.key]

# Exposes port 4201 under the name "jsonrpc" on the group.
named_port {
name = "jsonrpc"
port = "4201"
}
}
37 changes: 37 additions & 0 deletions infra/tf/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,43 @@ variable "checkpoint" {
}
}

# Configuration object for the persistence nodes. Every attribute except
# `nodes` has a default; the variable-level default creates a single node in
# region "asia-southeast1".
variable "persistence" {
description = "(Optional) The configuration of the persistence nodes"
type = object({
# NOTE(review): the unit of disk_size is not visible here (presumably GB) - confirm in modules/node.
disk_size = optional(number, 256)
instance_type = optional(string, "e2-standard-2")
provisioning_model = optional(string, "STANDARD")
generate_external_ip = optional(bool, false)
# One entry per placement; each entry must set exactly one of region or zone
# (enforced by the second validation block below).
nodes = list(object({
count = number
region = optional(string)
zone = optional(string)
}))
})
default = {
nodes : [
{
count = 1
region = "asia-southeast1"
}
]
}

# Validation for provisioning_model: only "STANDARD" and "SPOT" are accepted
validation {
condition = contains(["STANDARD", "SPOT"], var.persistence.provisioning_model)
error_message = "Provisioning model must be one of 'STANDARD' or 'SPOT'."
}

# Validation to check that exactly one of 'region' or 'zone' is specified for
# every node (setting both, or neither, is rejected)
validation {
condition = alltrue([
for node in var.persistence.nodes : (node.region != null && node.zone == null) || (node.region == null && node.zone != null)
])
error_message = "You need to specify either 'region' or 'zone' for a node."
}
}

variable "node_dns_subdomain" {
description = "Nodes DNS zone name"
type = string
Expand Down
26 changes: 14 additions & 12 deletions z2/docs/deployer.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@ Options:
Virtual Machine roles

Possible values:
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- sentry: Virtual machine sentry
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- persistence: Virtual machine persistence
- sentry: Virtual machine sentry

-v, --verbose...
Increase logging verbosity
Expand Down Expand Up @@ -421,12 +422,13 @@ Options:
Node role. Default: validator

Possible values:
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- sentry: Virtual machine sentry
- bootstrap: Virtual machine bootstrap
- validator: Virtual machine validator
- api: Virtual machine api
- apps: Virtual machine apps
- checkpoint: Virtual machine checkpoint
- persistence: Virtual machine persistence
- sentry: Virtual machine sentry

-v, --verbose...
Increase logging verbosity
Expand Down
5 changes: 4 additions & 1 deletion z2/resources/node_provision.tera.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,9 @@ def go(role):
install_docker()
install_ops_agent()
install_gcloud()
login_registry()
match role:
case "bootstrap" | "checkpoint":
case "bootstrap" | "checkpoint" | "persistence":
log("Configuring a validator node")
configure_logrotate()
pull_zq2_image()
Expand Down Expand Up @@ -555,6 +556,8 @@ def install_gcloud():
run_or_die(sudo_noninteractive_apt_env(["apt", "update"]))
run_or_die(sudo_noninteractive_apt_env(["sudo","apt", "install", "-y", "google-cloud-cli" ]))

# Log Docker in to https://asia-docker.pkg.dev as user "oauth2accesstoken",
# piping the output of `gcloud auth print-access-token` into
# `docker login --password-stdin` inside a single `sudo bash -c` invocation.
# NOTE(review): run_or_die is defined elsewhere; presumably it aborts provisioning on failure - confirm.
def login_registry():
run_or_die(["sudo", "bash", "-c", "gcloud auth print-access-token | docker login -u oauth2accesstoken --password-stdin https://asia-docker.pkg.dev" ])

def create_zq2_start_script():
with open("/tmp/zq2.sh", "w") as f:
Expand Down
65 changes: 31 additions & 34 deletions z2/resources/persistence_export.tera.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#!/bin/bash

# Directory where checkpoint files are located
CHECKPOINT_DIR="/data/{{ eth_chain_id }}/checkpoints"

# GCS bucket where the persistence will be exported
GCS_BUCKET="gs://{{ network_name }}-persistence"

Expand All @@ -24,34 +21,34 @@ is_dir_empty() {
# Start the persistence export process
start_time=$(date +%s)

if is_dir_empty "$CHECKPOINT_DIR"; then
# Stop zilliqa service
if ! sudo systemctl stop zilliqa.service; then
log_message "Error: Failed to stop zilliqa service"
exit 1
fi
# Create persistence export folder name with timestamp
persistence_export_name="{{ network_name }}-$(date +%Y%m%d%H%M%S)-persistence"
log_message "Creating persistence export: $persistence_export_name"
# Upload to GCS
if ! gsutil -m cp -r /data "$GCS_BUCKET/$persistence_export_name/"; then
log_message "Error: Failed to upload data to GCS"
sudo systemctl start zilliqa.service
exit 1
fi
# Start zilliqa service
if ! sudo systemctl start zilliqa.service; then
log_message "Error: Failed to start zilliqa service"
exit 1
fi
# Calculate and log total execution time
end_time=$(date +%s)
duration=$((end_time - start_time))
log_message "Persistence export completed successfully in $duration seconds"
else
log_message "Checkpoint files present in $CHECKPOINT_DIR. Skipping persistence export."
fi
# Stop zilliqa service
if ! sudo systemctl stop zilliqa.service; then
log_message "Error: Failed to stop zilliqa service"
exit 1
fi

# Create persistence export folder name with timestamp
persistence_export_name="{{ eth_chain_id }}-persistence-$(date +%Y%m%d)-$(date +%H%M%S)"
log_message "Creating persistence export: $persistence_export_name"

# Upload to GCS
if ! gsutil -m cp -r /data "$GCS_BUCKET/$persistence_export_name/"; then
log_message "Error: Failed to upload data to GCS"
sudo systemctl start zilliqa.service
exit 1
fi

# Start zilliqa service
if ! sudo systemctl start zilliqa.service; then
log_message "Error: Failed to start zilliqa service"
exit 1
fi

# Calculate and log total execution time
end_time=$(date +%s)
duration=$((end_time - start_time))
log_message "Persistence export completed successfully in $duration seconds"

# Keep only the 100 most recent persistence exports in the GCS bucket (everything from the 101st newest onward is deleted)
gsutil ls -d "$GCS_BUCKET/{{ eth_chain_id }}-persistence-*/" | sort -r | tail -n +101 | awk '{print $1}' | xargs -I {} gsutil rm -rfa {}
log_message "Cleanup completed"
51 changes: 32 additions & 19 deletions z2/src/chain/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ pub enum NodeRole {
Apps,
/// Virtual machine checkpoint
Checkpoint,
/// Virtual machine persistence
Persistence,
/// Virtual machine sentry
Sentry,
}
Expand All @@ -108,6 +110,7 @@ impl FromStr for NodeRole {
"apps" => Ok(NodeRole::Apps),
"validator" => Ok(NodeRole::Validator),
"checkpoint" => Ok(NodeRole::Checkpoint),
"persistence" => Ok(NodeRole::Persistence),
"sentry" => Ok(NodeRole::Sentry),
_ => Err(anyhow!("Node role not supported")),
}
Expand All @@ -122,6 +125,7 @@ impl fmt::Display for NodeRole {
NodeRole::Apps => write!(f, "apps"),
NodeRole::Validator => write!(f, "validator"),
NodeRole::Checkpoint => write!(f, "checkpoint"),
NodeRole::Persistence => write!(f, "persistence"),
NodeRole::Sentry => write!(f, "sentry"),
}
}
Expand Down Expand Up @@ -551,7 +555,9 @@ impl ChainNode {
self.machine
.copy(&[checkpoint_cron_job], "/tmp/checkpoint_cron_job.sh")
.await?;
}

if self.role == NodeRole::Persistence {
let temp_persistence_export_cron_job = NamedTempFile::new()?;
let persistence_export_cron_job = &self
.create_persistence_export_cron_job(
Expand Down Expand Up @@ -581,21 +587,19 @@ impl ChainNode {
}

if self.role == NodeRole::Checkpoint {
let cmd_checkpoint_cron_job = "sudo rm -f /tmp/checkpoint_cron_job.sh";
let output_cmd_checkpoint_cron_job =
self.machine.run(cmd_checkpoint_cron_job, true).await?;
if !output_cmd_checkpoint_cron_job.success {
println!("{:?}", output_cmd_checkpoint_cron_job.stderr);
let cmd = "sudo rm -f /tmp/checkpoint_cron_job.sh";
let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!("Error removing previous checkpoint cron job"));
}
}

let cmd_persistence_export_cron_job = "sudo rm -f /tmp/persistence_export_cron_job.sh";
let output_persistence_export_cron_job = self
.machine
.run(cmd_persistence_export_cron_job, true)
.await?;
if !output_persistence_export_cron_job.success {
println!("{:?}", output_persistence_export_cron_job.stderr);
if self.role == NodeRole::Persistence {
let cmd = "sudo rm -f /tmp/persistence_export_cron_job.sh";
let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!(
"Error removing previous persistence export cron job"
));
Expand All @@ -619,14 +623,25 @@ impl ChainNode {
let cmd = r#"
sudo chmod 777 /tmp/checkpoint_cron_job.sh && \
sudo mv /tmp/checkpoint_cron_job.sh /checkpoint_cron_job.sh && \
echo '*/30 * * * * /checkpoint_cron_job.sh' | sudo crontab -"#;

let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!("Error creating the checkpoint cronjob"));
}
}

if self.role == NodeRole::Persistence {
let cmd = r#"
sudo chmod 777 /tmp/persistence_export_cron_job.sh && \
sudo mv /tmp/persistence_export_cron_job.sh /persistence_export_cron_job.sh && \
(echo '*/5 * * * * /checkpoint_cron_job.sh'; echo '0 */6 * * * /persistence_export_cron_job.sh') | sudo crontab -"#;
echo '0 */2 * * * /persistence_export_cron_job.sh' | sudo crontab -"#;

let output = self.machine.run(cmd, true).await?;
if !output.success {
println!("{:?}", output.stderr);
return Err(anyhow!("Error setting up cron jobs"));
return Err(anyhow!("Error creating the persistence export cronjob"));
}
}

Expand Down Expand Up @@ -942,7 +957,7 @@ impl ChainNode {
// export the backup files
progress_bar.start("Exporting the backup files");
let command = format!(
"sudo gsutil -m cp -r /data gs://{}-persistence/{}",
"sudo gsutil -m cp -r /data gs://{}-persistence/{}/",
self.chain()?,
backup_name
);
Expand Down Expand Up @@ -1016,15 +1031,13 @@ impl ChainNode {
} else {
// delete the data folder
progress_bar.start(format!("{}: Deleting the data folder", self.name()));
machine
.run("sudo rm -rf /data && sudo mkdir -p /data", false)
.await?;
machine.run("sudo rm -rf /data", false).await?;
progress_bar.inc(1);

// import the backup files
progress_bar.start(format!("{}: Importing the backup files", self.name()));
let command = format!(
"sudo gsutil -m cp -r gs://{}-persistence/{}/* /data",
"sudo gsutil -m cp -r gs://{}-persistence/{}/* /",
self.chain()?,
backup_name
);
Expand Down
1 change: 1 addition & 0 deletions zq2-devnet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ roles:
- api
- apps
- checkpoint
- persistence
versions:
zq2: v0.4.0
otterscan: develop
Expand Down
1 change: 1 addition & 0 deletions zq2-infratest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ roles:
- api
- apps
- checkpoint
- persistence
versions:
zq2: 30a24610
otterscan: develop
Expand Down
Loading
Loading