Skip to content

Commit

Permalink
x86_64 deployment and setup for us-east-2b (use2-az2) (#75)
Browse files Browse the repository at this point in the history
* consistent defaults
* us-east-2 backend
* fix new VPC deployment
* terraform fixes
* spack ncurses issue fixed
* Newer efa driver not supported on Stream 8
* spack-stack in-progress
* deployable
* x86_64 RHEL8 setup
* turn off public IP association
* cbofs builds and runs
* making stacksize unlimited for running models
* Updated outputs.tf for conditional display of aws_eip related info if present
* retesting cbofs
* Debugged x86_64 (#74)
* debugging dependency issues
* liveocean and cbofs run again
* recommend at least a t3.xlarge for setup and building models; the instance type can be changed non-destructively with terraform
* fix misleading error message
* using private ip for hosts
* no public IP on nodes, fixed efa enable
* fixed no ssh checks for private IP addresses
* plotting debug
* Add availability_zone variable to placement group name to avoid name conflicts
* Quick fix for naming collisions for IAM stuff

---------

Co-authored-by: Zachary Wills <[email protected]>
Co-authored-by: Micah Wengren <[email protected]>
  • Loading branch information
3 people authored Jan 31, 2024
1 parent 62e81b2 commit 7ecc756
Show file tree
Hide file tree
Showing 110 changed files with 1,373 additions and 749 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,5 @@ terraform/SAVE.mysettings.tfvars
dev.tfvars
my.tfvars
my.plan
build/
cloudflow/build/
10 changes: 10 additions & 0 deletions TODO
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
add ulimit -s unlimited to every user profile

commit changes
test other builds
clean up
commit and tag as 1.5.0b
commit and tag nosofs as ioos.1.5.0b

Test liveocean build and run
commit and tag liveocean as ioos.1.5.0b
21 changes: 14 additions & 7 deletions cloudflow/cluster/AWSCluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
formatter = logging.Formatter(' %(asctime)s %(levelname)s | %(message)s')
fh.setFormatter(formatter)

efatypes=['c5', 'hpc' ]

# To avoid duplicate entries, only have one handler
# This log might have a handler in one of their higher level scripts
# This didn't work - still got duplicates, even when also added in main caller
Expand Down Expand Up @@ -345,7 +347,8 @@ def getHosts(self):
hosts = []

for instance in self.__instances:
hosts.append(instance.private_dns_name)
# hosts.append(instance.private_dns_name)
hosts.append(instance.private_ip_address)
return hosts


Expand All @@ -364,7 +367,7 @@ def getHostsCSV(self):
cnt = 0
for instance in self.__instances:
cnt += 1
hostname = instance.private_dns_name
hostname = instance.private_ip_address
# no comma on last host
if cnt == instcnt:
hosts += hostname
Expand All @@ -375,12 +378,15 @@ def getHostsCSV(self):


def __placementGroup(self):
""" This is a bit of a hack to satisfy AWS. Only c5 and c5n type of instances support placement group """
""" This is a bit of a hack to satisfy AWS. Only some instances support placement group """

group = {}
if self.nodeType.startswith('c5'):


if self.nodeType.startswith(tuple(efatypes)):
group = {'GroupName': self.placement_group}


return group


Expand All @@ -393,16 +399,17 @@ def __netInterface(self):
Also attaches security groups """

interface = {
'AssociatePublicIpAddress': True,
'AssociatePublicIpAddress': False,
'DeleteOnTermination': True,
'Description': 'Network adaptor via boto3 api',
'Description': 'Network adaptor via cloudflow boto3 api',
'DeviceIndex': 0,
'Groups': self.sg_ids,
'SubnetId': self.subnet_id
}

# if self.nodeType == 'c5n.18xlarge':
if self.nodeType in nodeInfo.efaTypes:

if self.nodeType.startswith(tuple(efatypes)):
interface['InterfaceType'] = 'efa'

return interface
Expand Down
16 changes: 0 additions & 16 deletions cloudflow/cluster/configs/adnoc.config

This file was deleted.

16 changes: 0 additions & 16 deletions cloudflow/cluster/configs/debug.config

This file was deleted.

14 changes: 7 additions & 7 deletions cloudflow/cluster/configs/ioos.amd.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@
"platform" : "AWS",
"region" : "us-east-2",
"nodeType" : "hpc6a.48xlarge",
"nodeCount" : 1,
"nodeCount" : 2,
"tags" : [
{ "Key": "Name", "Value": "IOOS-cloud-sandbox" },
{ "Key": "Project", "Value": "IOOS-cloud-sandbox" },
{ "Key": "NAME", "Value": "nosofs-fcst" }
],
"image_id" : "ami-085591407d6dd69bf",
"image_id" : "ami-01c437ac2b5beddda",
"key_name" : "ioos-sandbox",
"sg_ids" : [
"sg-03e4479bc61057a30",
"sg-07588b4230c8e29b0",
"sg-03b2af1315a3fc526"
"sg-0c72b9cd2d5143515",
"sg-0fd05da3e3474d0e1",
"sg-05d4f1d9145a6666b"
],
"subnet_id" : "subnet-08689a3c6a43e807d",
"placement_group" : "Patrick-ioos-cloud-sandbox_Terraform_Placement_Group"
"subnet_id" : "subnet-0b78841b900162b6c",
"placement_group" : "Patrick-x86_64-sandbox_Terraform_Placement_Group"
}
16 changes: 8 additions & 8 deletions cloudflow/cluster/configs/ioos.config
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
{
"platform" : "AWS",
"region" : "us-east-2",
"nodeType" : "c5n.18xlarge",
"nodeCount" : 4,
"nodeType" : "hpc6a.48xlarge",
"nodeCount" : 2,
"tags" : [
{ "Key": "Name", "Value": "IOOS-cloud-sandbox" },
{ "Key": "Project", "Value": "IOOS-cloud-sandbox" },
{ "Key": "NAME", "Value": "nosofs-fcst" }
],
"image_id" : "ami-052928467e84be78c",
"image_id" : "ami-0a4b30237fee5d223",
"key_name" : "ioos-sandbox",
"sg_ids" : [
"sg-03e4479bc61057a30",
"sg-07588b4230c8e29b0",
"sg-03b2af1315a3fc526"
"sg-0810cdb0357f0a92c",
"sg-058243450eb3b4308",
"sg-0b2a044dad6a21156"
],
"subnet_id" : "subnet-08689a3c6a43e807d",
"placement_group" : "Patrick-ioos-cloud-sandbox_Terraform_Placement_Group"
"subnet_id" : "subnet-0a83f73406e809b00",
"placement_group" : "Patrick-x86_64-sandbox_Terraform_Placement_Group"
}
20 changes: 0 additions & 20 deletions cloudflow/cluster/configs/ioos.slurm.cluster

This file was deleted.

15 changes: 0 additions & 15 deletions cloudflow/cluster/configs/jupyterhub_post.config

This file was deleted.

16 changes: 0 additions & 16 deletions cloudflow/cluster/configs/liveocean.qops.fcst

This file was deleted.

16 changes: 0 additions & 16 deletions cloudflow/cluster/configs/liveocean.qops.post

This file was deleted.

15 changes: 0 additions & 15 deletions cloudflow/cluster/configs/nyh-hindcasts.config

This file was deleted.

20 changes: 0 additions & 20 deletions cloudflow/cluster/configs/slurm.config

This file was deleted.

16 changes: 0 additions & 16 deletions cloudflow/cluster/configs/wrfroms.config

This file was deleted.

2 changes: 1 addition & 1 deletion cloudflow/job/ROMSForecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def parseConfig(self, cfDict):

if self.OCNINTMPL == "auto":
self.OCNINTMPL = f"{self.TEMPLPATH}/{self.OFS}.ocean.in"

return


Expand Down
6 changes: 3 additions & 3 deletions cloudflow/job/jobs/cbofs.00z.fcst
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
"OFS" : "cbofs",
"CDATE" : "today",
"HH" : "00",
"COMROT" : "/com/patrick.tripp/nosofs",
"PTMP" : "/ptmp/patrick.tripp",
"COMROT" : "/com/ec2-user/nosofs",
"PTMP" : "/ptmp/ec2-user",
"EXEC" : "",
"TIME_REF" : "20160101.0d0",
"BUCKET" : "ioos-cloud-sandbox",
"BCKTFLDR" : "nosofs/patrick.tripp/cbofs/output",
"BCKTFLDR" : "nosofs/ec2-user/cbofs/output",
"NTIMES" : "34560",
"ININAME" : "",
"OUTDIR" : "auto",
Expand Down
17 changes: 17 additions & 0 deletions cloudflow/job/jobs/cbofs.20231207
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"JOBTYPE" : "romsforecast",
"OFS" : "cbofs",
"CDATE" : "20231207",
"HH" : "06",
"COMROT" : "/com/ec2-user/nosofs",
"PTMP" : "/ptmp/ec2-user",
"EXEC" : "",
"TIME_REF" : "20160101.0d0",
"BUCKET" : "ioos-cloud-sandbox",
"BCKTFLDR" : "nosofs/patrick.tripp/cbofs/output",
"NTIMES" : "34560",
"ININAME" : "",
"OUTDIR" : "auto",
"OCEANIN" : "auto",
"OCNINTMPL" : "auto"
}
6 changes: 3 additions & 3 deletions cloudflow/job/jobs/cbofs.fcst
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
"JOBTYPE" : "romsforecast",
"OFS" : "cbofs",
"CDATE" : "today",
"HH" : "06",
"COMROT" : "/com/patrick.tripp/nosofs",
"PTMP" : "/ptmp/patrick.tripp",
"HH" : "00",
"COMROT" : "/com/ec2-user/nosofs",
"PTMP" : "/ptmp/ec2-user",
"EXEC" : "",
"TIME_REF" : "20160101.0d0",
"BUCKET" : "ioos-cloud-sandbox",
Expand Down
13 changes: 13 additions & 0 deletions cloudflow/job/jobs/liveocean.diffplots
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"JOBTYPE" : "plotting_diff",
"OFS" : "liveocean",
"CDATE" : "20170601",
"HH" : "00",
"INDIR" : "/com/ec2-user/LO_roms/cas6_traps2_x2b/f2017.06.01",
"OUTDIR" : "/save/ec2-user/LO_plots",
"VERIFDIR" : "/com/ec2-user/apogee/f2017.06.01",
"VARS" : ["temp", "zeta", "w", "salt"],
"BUCKET" : "ioos-cloud-sandbox",
"BCKTFLDR" : "LiveOcean/plots",
"FSPEC" : "ocean_his_*.nc"
}
2 changes: 1 addition & 1 deletion cloudflow/job/jobs/liveocean.fcst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"OFS" : "liveocean",
"CDATE" : "20170601",
"HH" : "00",
"COMROT" : "/com/patrick.tripp",
"COMROT" : "/com/ec2-user",
"PTMP" : "/ptmp",
"EXEC" : "",
"TIME_REF" : "19700101",
Expand Down
5 changes: 3 additions & 2 deletions cloudflow/job/templates/liveocean.ocean.in
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@
! Input variable information file name. This file needs to be processed
! first so all information arrays can be initialized properly.

VARNAME = /save/ioos/patrick.tripp/LiveOcean/LO_roms_source_alt/varinfo/varinfo.yaml
#VARNAME = __SAVE__/LiveOcean/LO_roms_source_alt/varinfo/varinfo.yaml
VARNAME = varinfo.yaml

! Number of nested grids.

Expand Down Expand Up @@ -1012,7 +1013,7 @@ PIO_I2C_Preq = 65 ! Maximum pending I2C requests

NBCFILES == 1 ! number of boundary files

BRYNAME == /com/patrick.tripp/LO_output/forcing/cas6/__FDATE__/ocn00/ocean_bry.nc
BRYNAME == __COMROT__/LO_output/forcing/cas6/__FDATE__/ocn00/ocean_bry.nc

! Input climatology file names. The USER has the option to separate the
! climatology variables into individual NetCDF files (NCLMFILES > 1),
Expand Down
Loading

0 comments on commit 7ecc756

Please sign in to comment.