Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing start, stop with ec2.py, adding validations #1191

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
3b11c13
Removing start, stop with ec2.py, adding validations
abuabraham-ttd Dec 9, 2024
6034c5e
Removing start, stop with ec2.py, adding validations
abuabraham-ttd Dec 9, 2024
d1f1756
Updates
abuabraham-ttd Dec 10, 2024
f42f872
Updates
abuabraham-ttd Dec 10, 2024
2542232
Add virtual env and start it in systemd
abuabraham-ttd Dec 10, 2024
edf85f3
Add virtual env and start it in systemd
abuabraham-ttd Dec 10, 2024
3e95e4c
Add virtual env and start it in systemd
abuabraham-ttd Dec 10, 2024
cb70032
use venv like flask service
abuabraham-ttd Dec 10, 2024
5fe844c
use versions
abuabraham-ttd Dec 10, 2024
937e7a2
Add URL validation
abuabraham-ttd Dec 10, 2024
2b23ff0
Move validations around
abuabraham-ttd Dec 10, 2024
44aa71f
Move validations around
abuabraham-ttd Dec 10, 2024
711d50b
Move validations around
abuabraham-ttd Dec 10, 2024
4c694e7
Remove aws implemnttion from typedict
abuabraham-ttd Dec 10, 2024
62cc490
Remove aws implemnttion from typedict
abuabraham-ttd Dec 10, 2024
5de70be
Adding more logs
abuabraham-ttd Dec 11, 2024
77f1f4a
Adding min capacity
abuabraham-ttd Dec 11, 2024
a4241fc
Loop every sec for 10sec for confg server to be up
abuabraham-ttd Dec 11, 2024
0bff456
Fix regex
abuabraham-ttd Dec 11, 2024
e669887
validate after default
abuabraham-ttd Dec 11, 2024
85fc3e7
Add tested min values for capacity
abuabraham-ttd Dec 11, 2024
d7b24c7
[CI Pipeline] Released Snapshot version: 5.43.1-alpha-93-SNAPSHOT
Dec 12, 2024
4499dcf
Add to build eif stage
abuabraham-ttd Dec 12, 2024
3dd967d
[CI Pipeline] Released Snapshot version: 5.43.2-alpha-94-SNAPSHOT
Dec 12, 2024
45b2908
Dont check for enclave, kill all
abuabraham-ttd Dec 12, 2024
d890e5d
Change version on ami build
abuabraham-ttd Dec 12, 2024
8eeaf9a
[CI Pipeline] Released Snapshot version: 5.43.3-alpha-100-SNAPSHOT
Dec 13, 2024
eb8955c
Use AuxilaryConfig to store and return URLs
abuabraham-ttd Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/aws/config-server/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Flask==2.3.2
Werkzeug==3.0.3
setuptools==70.0.0
setuptools==70.0.0
220 changes: 220 additions & 0 deletions scripts/aws/ec2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env python3

import argparse
import json
import multiprocessing
import os
import re
import signal
import subprocess
import sys
import time
from typing import Any, Dict

import boto3
import requests
import yaml
from botocore.exceptions import ClientError

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from confidential_compute import ConfidentialCompute, ConfidentialComputeConfig, SecretNotFoundException


class AWSConfidentialComputeConfig(ConfidentialComputeConfig):
    """Operator configuration extended with the AWS enclave sizing keys."""

    # Memory requested for the enclave; compared against and defaulted from
    # the `memory_mib` value of the Nitro allocator file.
    enclave_memory_mb: int
    # Number of CPUs requested for the enclave; compared against and
    # defaulted from the `cpu_count` value of the Nitro allocator file.
    enclave_cpu_count: int


class EC2(ConfidentialCompute):

def __init__(self):
    """Create the instance by delegating entirely to the parent initialiser."""
    super().__init__()

def __get_aws_token(self) -> str:
    """Fetch a temporary session token from the EC2 instance metadata service.

    Returns:
        The token text returned by the metadata token endpoint.

    Raises:
        RuntimeError: If the request fails or the endpoint answers with an
            HTTP error status.
    """
    token_url = "http://169.254.169.254/latest/api/token"
    try:
        response = requests.put(
            token_url,
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "3600"},
            timeout=2,
        )
        # Without this check the body of an error response would be handed
        # back as if it were a token and only fail later, far from the cause.
        response.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Failed to fetch aws token: {e}") from e
    return response.text

def __get_current_region(self) -> str:
    """Return the AWS region reported by the instance identity document.

    Returns:
        The value of the ``region`` field of the identity document.

    Raises:
        RuntimeError: If the token cannot be fetched, the metadata request
            fails, or the document is not JSON / has no ``region`` field.
    """
    token = self.__get_aws_token()
    metadata_url = "http://169.254.169.254/latest/dynamic/instance-identity/document"
    headers = {"X-aws-ec2-metadata-token": token}
    try:
        response = requests.get(metadata_url, headers=headers, timeout=2)
        response.raise_for_status()
        return response.json()["region"]
    except (requests.RequestException, ValueError, KeyError) as e:
        # ValueError/KeyError cover a malformed or incomplete document so the
        # caller always sees the same RuntimeError for "region unavailable".
        raise RuntimeError(f"Failed to fetch region: {e}") from e

def __validate_aws_specific_config(self, secret):
    """Reject enclave sizing values that exceed what the allocator provides.

    Nothing is checked when the secret sets neither sizing key. Otherwise
    each key is compared (a missing one counts as 0) against the capacity
    returned by ``__get_max_capacity``.

    Raises:
        ValueError: If a requested value is greater than the allowed maximum.
    """
    capacity_keys = ("enclave_memory_mb", "enclave_cpu_count")
    if not any(name in secret for name in capacity_keys):
        return
    limits = self.__get_max_capacity()
    for name in capacity_keys:
        requested = secret.get(name, 0)
        ceiling = limits.get(name)
        if int(requested) > ceiling:
            raise ValueError(f"{name} value ({requested}) exceeds the maximum allowed ({ceiling}).")
abuabraham-ttd marked this conversation as resolved.
Show resolved Hide resolved

def _get_secret(self, secret_identifier: str) -> AWSConfidentialComputeConfig:
    """Fetch the operator configuration from AWS Secrets Manager.

    The secret string is parsed as JSON, its enclave sizing keys are
    validated, and missing sizing / debug keys are filled with defaults.

    Args:
        secret_identifier: Value passed as ``SecretId`` to Secrets Manager.

    Returns:
        The parsed configuration with defaults applied.

    Raises:
        RuntimeError: If the Secrets Manager client cannot be created.
        SecretNotFoundException: If Secrets Manager rejects the lookup with a
            ``ClientError``.
        ValueError: If a sizing value exceeds the allowed maximum.
    """

    def add_defaults(configs: Dict[str, Any]) -> AWSConfidentialComputeConfig:
        """Fill in enclave sizing and debug mode for keys that are absent."""
        default_capacity = self.__get_max_capacity()
        configs.setdefault("enclave_memory_mb", default_capacity["enclave_memory_mb"])
        configs.setdefault("enclave_cpu_count", default_capacity["enclave_cpu_count"])
        configs.setdefault("debug_mode", False)
        return configs

    region = self.__get_current_region()
    try:
        client = boto3.client("secretsmanager", region_name=region)
    except Exception as e:
        raise RuntimeError("Please use IAM instance profile for your instance that has permission to access Secret Manager") from e
    # Only the remote lookup can raise ClientError, so only it is guarded.
    try:
        secret = json.loads(client.get_secret_value(SecretId=secret_identifier)["SecretString"])
    except ClientError as e:
        raise SecretNotFoundException(f"{secret_identifier} in {region}") from e
    self.__validate_aws_specific_config(secret)
    return add_defaults(secret)

@staticmethod
def __get_max_capacity():
    """Read the enclave capacity from the Nitro allocator file.

    Returns:
        A dict with ``enclave_memory_mb`` (the file's ``memory_mib``) and
        ``enclave_cpu_count`` (the file's ``cpu_count``).

    Raises:
        RuntimeError: If the file cannot be read or parsed, or lacks either
            value. The underlying error is attached as the cause.
    """
    try:
        with open("/etc/nitro_enclaves/allocator.yaml", "r") as file:
            nitro_config = yaml.safe_load(file)
        return {"enclave_memory_mb": nitro_config['memory_mib'], "enclave_cpu_count": nitro_config['cpu_count']}
    except Exception as e:
        # Chain the cause so a missing file, bad YAML and a missing key can
        # be told apart in the traceback.
        raise RuntimeError("/etc/nitro_enclaves/allocator.yaml does not have CPU, memory allocated") from e

def __setup_vsockproxy(self, log_level: int) -> None:
    """Start the vsock proxy as a daemon.

    Args:
        log_level: Value forwarded to the proxy's ``--log-level`` option.
    """
    # Half of the available CPUs, rounded up.
    worker_count = (multiprocessing.cpu_count() + 1) // 2
    vsockpx_args = ["/usr/bin/vsockpx", "-c", "/etc/uid2operator/proxy.yaml"]
    vsockpx_args += ["--workers", str(worker_count)]
    vsockpx_args += ["--log-level", str(log_level)]
    vsockpx_args.append("--daemon")
    self.run_command(vsockpx_args)

def __run_config_server(self) -> None:
    """Write the configuration to disk and launch the Flask config server.

    ``self.configs`` is dumped as JSON to ``/etc/secret/secret-value/config``;
    the working directory is then switched to the config-server directory and
    Flask is started on 127.0.0.1:27015 as a separate process.
    """
    secret_dir = "/etc/secret/secret-value"
    os.makedirs(secret_dir, exist_ok=True)
    with open("/etc/secret/secret-value/config", 'w') as out:
        json.dump(self.configs, out)

    # The flask binary is addressed relative to the config-server directory.
    os.chdir("/opt/uid2operator/config-server")
    flask_args = ["./bin/flask", "run"]
    flask_args += ["--host", "127.0.0.1"]
    flask_args += ["--port", "27015"]
    self.run_command(flask_args, seperate_process=True)

def __run_socks_proxy(self) -> None:
    """Launch the SOCKS proxy by running ``sockd -D``."""
    self.run_command(["sockd", "-D"])

def __get_secret_name_from_userdata(self) -> str:
    """Work out which secret holds the operator configuration.

    The instance user data is searched for a line of the form
    ``export <SCOPE>_CONFIG_SECRET_KEY="<name>"``, where ``<SCOPE>`` is the
    upper-cased content of ``/opt/uid2operator/identity_scope.txt``.

    Returns:
        The quoted name from that line, or ``<scope>-operator-config-key``
        (scope lower-cased) when no such line exists.

    Raises:
        RuntimeError: If the metadata token cannot be fetched.
        requests.RequestException: If the user-data request fails or times out.
    """
    token = self.__get_aws_token()
    user_data_url = "http://169.254.169.254/latest/user-data"
    # Same timeout as the other metadata calls so startup cannot hang here.
    response = requests.get(user_data_url, headers={"X-aws-ec2-metadata-token": token}, timeout=2)
    user_data = response.text

    with open("/opt/uid2operator/identity_scope.txt") as file:
        identity_scope = file.read().strip()

    default_name = f"{identity_scope.lower()}-operator-config-key"
    hardcoded_value = f"{identity_scope.upper()}_CONFIG_SECRET_KEY"
    # The scope comes from a file, so escape it before using it in a pattern.
    match = re.search(rf'^export {re.escape(hardcoded_value)}="(.+?)"$', user_data, re.MULTILINE)
    return match.group(1) if match else default_name

def _setup_auxiliaries(self) -> None:
"""Sets up the necessary auxiliary services and configuration."""
abuabraham-ttd marked this conversation as resolved.
Show resolved Hide resolved
self.configs = self._get_secret(self.__get_secret_name_from_userdata())
self.validate_configuration()
abuabraham-ttd marked this conversation as resolved.
Show resolved Hide resolved
log_level = 3 if self.configs["debug_mode"] else 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the right translation of the old bash script:

VSOCK_LOG_LEVEL=${VSOCK_LOG_LEVEL:-3}
?
(Granted, I don't quite understand this bash script.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if VSOCK_LOG_LEVEL ? then VSOCK_LOG_LEVEL else 3

there is no way to set VSOCK_LOG_LEVEL though.

So I added if debug_mode ? then set to 3 otherwise 1.

In future ticket, we will be adding more tracing on debug mode.

Plan is to evaluate adding

echo 1 > /sys/kernel/debug/tracing/events/vsock/enable
echo 1 > /sys/kernel/debug/tracing/tracing_on

self.__setup_vsockproxy(log_level)
self.__run_config_server()
self.__run_socks_proxy()
time.sleep(5) #TODO: Change to while loop if required.
Copy link
Contributor

@sunnywu sunnywu Dec 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need to sleep here? doesn't subprocess.run already run the command synchronously and will wait for its completion? For config server, it would be run as separate process so will never need to wait for it anyway?

or is this to wait for config server to startup so you could do validations? if so, you might wanna consider making it more robust (are we 100% sure 5 seconds wait is enough?), or at the very least, logs something during this 5 second wait to inform the customer this script is still running but waiting for something.

or maybe it should be in a loop and every 5 seconds the validation script will try to connect to the server until it's successful.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or is this to wait for config server to startup so you could do validations?

Yes. We can change it to a while loop if required, as mentioned in the TODO comment.


def _validate_auxiliaries(self) -> None:
    """Check that the config server answers directly and through the SOCKS proxy.

    Raises:
        RuntimeError: If the config server cannot be reached directly, or
            cannot be reached when the request goes through the proxy.
    """
    proxy = "socks5://127.0.0.1:3306"
    config_url = "http://127.0.0.1:27015/getConfig"
    # requests has no default timeout; without one a service that accepts the
    # connection but never answers would block startup forever.
    timeout_seconds = 5
    try:
        response = requests.get(config_url, timeout=timeout_seconds)
        response.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Config server unreachable: {e}") from e
    proxies = {"http": proxy, "https": proxy}
    try:
        response = requests.get(config_url, proxies=proxies, timeout=timeout_seconds)
        response.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Cannot connect to config server via SOCKS proxy: {e}") from e

def run_compute(self) -> None:
"""Main execution flow for confidential compute."""
self._setup_auxiliaries()
self._validate_auxiliaries()
command = [
abuabraham-ttd marked this conversation as resolved.
Show resolved Hide resolved
"nitro-cli", "run-enclave",
"--eif-path", "/opt/uid2operator/uid2operator.eif",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be helpful for debugging to have the ability to pass this in.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

We need to build an EIF and register the EIF in core. Why not just use the AMI build along with the EIF? Otherwise we would have to copy the registered EIF and run it by SSHing into the host, right?

"--memory", str(self.configs["enclave_memory_mb"]),
"--cpu-count", str(self.configs["enclave_cpu_count"]),
"--enclave-cid", "42",
"--enclave-name", "uid2operator"
]
if self.configs["debug_mode"]:
command += ["--debug-mode", "--attach-console"]
self.run_command(command)

def cleanup(self) -> None:
    """Terminate the running Nitro Enclave, then kill the auxiliary processes.

    The first enclave listed by ``nitro-cli describe-enclaves`` is terminated
    if there is one; the auxiliary processes are killed either way.

    Raises:
        RuntimeError: If a subprocess involved in the cleanup fails.
    """
    try:
        describe_output = subprocess.check_output(["nitro-cli", "describe-enclaves"], text=True)
        enclaves = json.loads(describe_output)
        enclave_id = enclaves[0].get("EnclaveID") if enclaves else None
        if enclave_id:
            self.run_command(["nitro-cli", "terminate-enclave", "--enclave-id", enclave_id])
            print(f"Terminated enclave with ID: {enclave_id}")
        else:
            print("No active enclaves found.")
        self.__kill_auxiliaries()
    except subprocess.SubprocessError as e:
        # Raising a bare string is itself a TypeError; raise a real exception
        # and keep the subprocess failure attached as the cause.
        raise RuntimeError(f"Error during cleanup: {e}") from e

def __kill_auxiliaries(self) -> None:
    """Send SIGKILL to every process matching vsockpx, sockd or flask.

    Matching PIDs are found with ``pgrep -f``. This is best effort: a failure
    while handling one process name is printed and the remaining names are
    still processed.
    """
    for process_name in ["vsockpx", "sockd", "flask"]:
        # Guard each name separately so that, for example, a PID that exits
        # between pgrep and kill does not leave the other helpers running.
        try:
            result = subprocess.run(["pgrep", "-f", process_name], stdout=subprocess.PIPE, text=True, check=False)
            pids = result.stdout.split()
            if not pids:
                print(f"No process named '{process_name}' found.")
                continue
            for pid in pids:
                os.kill(int(pid), signal.SIGKILL)
            print(f"Killed process '{process_name}'.")
        except Exception as e:
            print(f"Error killing process '{process_name}': {e}")


if __name__ == "__main__":
    # Command-line entry point: "-o stop" tears everything down; anything
    # else (default "start") runs the full start-up flow.
    cli = argparse.ArgumentParser(description="Manage EC2-based confidential compute workflows.")
    cli.add_argument("-o", "--operation", choices=["stop", "start"], default="start", help="Operation to perform.")
    requested_operation = cli.parse_args().operation
    ec2 = EC2()
    action = ec2.cleanup if requested_operation == "stop" else ec2.run_compute
    action()

4 changes: 4 additions & 0 deletions scripts/aws/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
requests[socks]==2.32.3
boto3==1.35.59
urllib3==1.26.20
PyYAML===6.0.2
124 changes: 0 additions & 124 deletions scripts/aws/start.sh

This file was deleted.

31 changes: 0 additions & 31 deletions scripts/aws/stop.sh

This file was deleted.

Loading
Loading