Skip to content

Commit

Permalink
Merge pull request #617 from alan-turing-institute/tier3-package-depe…
Browse files Browse the repository at this point in the history
…ndencies-collection

Package dependency collector for CRAN and PyPI
  • Loading branch information
jemrobinson authored Jun 9, 2020
2 parents 8d8da40 + 1d41bfe commit 65d87bf
Show file tree
Hide file tree
Showing 10 changed files with 2,161 additions and 1,109 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
secrets/
temp/

# Dependency cache for package whitelisting
deployment/administration/.dependency_cache.json

# Development tools
.vscode
.python-version
Expand Down
168 changes: 168 additions & 0 deletions deployment/administration/SHM_Expand_Whitelist_Dependencies.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Expand a package whitelist so that it also contains package dependencies
# ------------------------------------------------------------------------
# Reads the core tier-3 whitelist plus the base image package lists for the chosen
# mirror type, looks each package up through the libraries.io API, follows the
# dependencies it reports and writes the result to the matching 'whitelist-full-*'
# file. Dependency lookups are cached in '.dependency_cache.json' next to this script.
#
# Parameters:
#   MirrorType - which mirror to expand: 'pypi' or 'cran'
#   ApiKey     - libraries.io API key, sent as the 'api_key' query parameter
param(
[Parameter(Mandatory = $true, HelpMessage = "Mirror type to expand (either 'pypi' or 'cran')")]
[ValidateSet("pypi", "cran")]
[string]$MirrorType,
[Parameter(Mandatory = $true, HelpMessage = "API key for libraries.io")]
[string]$ApiKey
)

Import-Module $PSScriptRoot/../common/Logging.psm1 -Force


# Check that a package exists in a repository
# -------------------------------------------
# Queries the libraries.io API for <Repository>/<Package> and returns the parsed response.
# If the API answers with an HTTP error the failure is logged and the original error is
# rethrown, so callers can catch [Microsoft.PowerShell.Commands.HttpResponseException].
function Test-PackageExistence {
    param(
        [Parameter(Mandatory = $true, HelpMessage = "Name of package repository")]
        $Repository,
        [Parameter(Mandatory = $true, HelpMessage = "Name of package to get dependencies for")]
        $Package,
        [Parameter(Mandatory = $true, HelpMessage = "API key for libraries.io")]
        $ApiKey
    )
    # Build the URI as an explicit string rather than relying on bare-word argument parsing
    $uri = "https://libraries.io/api/${Repository}/${Package}?api_key=${ApiKey}"
    try {
        return Invoke-RestMethod -Uri $uri -MaximumRetryCount 12 -RetryIntervalSec 5 -ErrorAction Stop
    } catch [Microsoft.PowerShell.Commands.HttpResponseException] {
        Add-LogMessage -Level Error "... $Package could not be found in ${Repository}"
        # A bare 'throw' rethrows the caught error record unchanged, keeping its
        # ErrorDetails and InvocationInfo (throwing $_.Exception would discard them)
        throw
    }
}


# Get dependencies for all versions of a given package
# ----------------------------------------------------
# For each requested version the dependency names are taken from $Cache when present,
# otherwise they are fetched from the libraries.io API and stored in $Cache (which is
# modified in place). Dependencies whose kind is "suggests" are ignored.
# Returns the sorted, de-duplicated names across all versions that could be loaded.
# If an HTTP error occurs, the error is logged and the versions gathered so far are used.
function Get-Dependencies {
    param(
        [Parameter(Mandatory = $true, HelpMessage = "Name of package repository")]
        $Repository,
        [Parameter(Mandatory = $true, HelpMessage = "Name of package to get dependencies for")]
        $Package,
        [Parameter(Mandatory = $true, HelpMessage = "Versions of package to get dependencies for")]
        $Versions,
        [Parameter(Mandatory = $true, HelpMessage = "API key for libraries.io")]
        $ApiKey,
        [Parameter(Mandatory = $true, HelpMessage = "Hashtable containing cached dependencies")]
        $Cache
    )
    $dependencies = @()
    if ($Package -NotIn $Cache[$Repository].Keys) { $Cache[$Repository][$Package] = [ordered]@{} }
    Add-LogMessage -Level Info "... found $($Versions.Count) versions of $Package"
    try {
        foreach ($version in $Versions) {
            if ($version -NotIn $Cache[$Repository][$Package].Keys) {
                $uri = "https://libraries.io/api/${Repository}/${Package}/${version}/dependencies?api_key=${ApiKey}"
                $response = Invoke-RestMethod -Uri $uri -MaximumRetryCount 12 -RetryIntervalSec 5 -ErrorAction Stop
                # Keep the whole pipeline inside @() so that a single dependency is still cached
                # as an array, and use the built-in Get-Unique rather than an external 'uniq'
                $Cache[$Repository][$Package][$version] = @(
                    $response.dependencies |
                        Where-Object { $_.kind -ne "suggests" } |
                        ForEach-Object { $_.name } |
                        Where-Object { $_ } |
                        Sort-Object |
                        Get-Unique
                )
            }
            $dependencies += $Cache[$Repository][$Package][$version]
        }
    } catch [Microsoft.PowerShell.Commands.HttpResponseException] {
        Add-LogMessage -Level Error "... could not load dependencies for all versions of $Package"
    }
    if (-Not $dependencies) { return @() }
    return $($dependencies | Sort-Object | Get-Unique)
}


# Work out which whitelist and cache files belong to this mirror type
# -------------------------------------------------------------------
# The list file names contain both the language name and the mirror name
$languageName = switch ($MirrorType) {
    "cran" { "r" }
    "pypi" { "python" }
}
$packageListDirectory = Join-Path $PSScriptRoot ".." ".." "environment_configs" "package_lists"
$coreWhitelistPath = Join-Path $packageListDirectory "whitelist-core-${languageName}-${MirrorType}-tier3.list"
$fullWhitelistPath = Join-Path $packageListDirectory "whitelist-full-${languageName}-${MirrorType}-tier3.list"
$dependencyCachePath = Join-Path $PSScriptRoot ".dependency_cache.json"

# Combine base image package lists with the core whitelist to construct a single list of core packages
# ----------------------------------------------------------------------------------------------------
# The wildcard makes Get-Content read every matching base image package list
$baseImagePackageLists = Join-Path $PSScriptRoot ".." "dsvm_images" "packages" "packages-${languageName}-${MirrorType}*.list"
# Each Get-Content is wrapped in @() so that a one-line file is appended as a list item
# instead of being string-concatenated. Blank lines are dropped because an empty package
# name cannot be looked up, and Get-Unique replaces the external 'uniq' binary.
$corePackageList = @(
    (@(Get-Content $coreWhitelistPath) + @(Get-Content $baseImagePackageLists)) |
        ForEach-Object { "$_".Trim() } |
        Where-Object { $_ } |
        Sort-Object |
        Get-Unique
)


# Initialise the package queue
# ----------------------------
$queue = New-Object System.Collections.Queue
$corePackageList | ForEach-Object { $queue.Enqueue($_) }
$allDependencies = @()


# Load any previously-cached dependencies
# ---------------------------------------
$dependencyCache = [ordered]@{}
if (Test-Path $dependencyCachePath -PathType Leaf) {
    # The cache is rewritten after every package, so an interrupted run can leave an empty file.
    # Only parse it when there is content; otherwise keep the empty cache created above.
    $cachedJson = Get-Content $dependencyCachePath -Raw
    if (-Not [string]::IsNullOrWhiteSpace($cachedJson)) {
        $loadedCache = $cachedJson | ConvertFrom-Json -AsHashtable
        if ($null -ne $loadedCache) { $dependencyCache = $loadedCache }
    }
}
# Make sure that the sections used by this run exist
if ($MirrorType -NotIn $dependencyCache.Keys) { $dependencyCache[$MirrorType] = [ordered]@{} }
if ("unavailable_packages" -NotIn $dependencyCache.Keys) { $dependencyCache["unavailable_packages"] = [ordered]@{} }
if ($MirrorType -NotIn $dependencyCache["unavailable_packages"].Keys) { $dependencyCache["unavailable_packages"][$MirrorType] = @() }


# Resolve packages iteratively until the queue is empty
# -----------------------------------------------------
# Each queued name is looked up on libraries.io. The name from the response is added to
# the whitelist and any dependencies not yet seen are appended to the queue. Names that
# cannot be found are recorded under "unavailable_packages" in the dependency cache.
$packageWhitelist = @()
Add-LogMessage -Level Info "Preparing to expand dependencies for $($queue.Count) packages from $MirrorType"
while ($queue.Count) {
    try {
        $unverifiedName = $queue.Dequeue()
        # Check that the package exists and add it to the whitelist if so
        Add-LogMessage -Level Info "Determining canonical name for '$unverifiedName'"
        $response = Test-PackageExistence -Repository $MirrorType -Package $unverifiedName -ApiKey $ApiKey
        $versions = $response.versions | ForEach-Object { $_.number } | Sort-Object
        # Only add each name once so that the counts logged below are accurate
        if ($response.Name -CNotIn $packageWhitelist) { $packageWhitelist += @($response.Name) }
        # Look for dependencies and add them to the queue
        if ($versions) {
            Add-LogMessage -Level Info "... finding dependencies for $($response.Name)"
            $dependencies = Get-Dependencies -Repository $MirrorType -Package $response.Name -Versions $versions -ApiKey $ApiKey -Cache $dependencyCache
            Add-LogMessage -Level Info "... found $($dependencies.Count) dependencies: $dependencies"
            # Skip anything that is already whitelisted, already seen as a dependency, still
            # waiting in the queue (e.g. a core package) or known to be unavailable, so that
            # no package is looked up more than once
            $queuedNames = $queue.ToArray()
            $newPackages = $dependencies |
                Where-Object { $_ -NotIn $packageWhitelist } |
                Where-Object { $_ -NotIn $allDependencies } |
                Where-Object { $_ -NotIn $queuedNames } |
                Where-Object { $_ -NotIn $dependencyCache["unavailable_packages"][$MirrorType] }
            $newPackages | ForEach-Object { $queue.Enqueue($_) }
            $allDependencies += $dependencies
        } else {
            Add-LogMessage -Level Warning "... could not find any versions of $($response.Name)"
        }
    } catch [Microsoft.PowerShell.Commands.HttpResponseException] {
        # If this package could not be found then mark it as unavailable
        Add-LogMessage -Level Error "... marking '$unverifiedName' as unavailable"
        $dependencyCache["unavailable_packages"][$MirrorType] += @($unverifiedName) | Where-Object { $_ -NotIn $dependencyCache["unavailable_packages"][$MirrorType] }
    }
    Add-LogMessage -Level Info "... there are $($packageWhitelist.Count) packages on the expanded whitelist"
    Add-LogMessage -Level Info "... there are $($queue.Count) packages in the queue"
    # Write to the dependency file after each package in case the script terminates early
    $dependencyCache | ConvertTo-Json -Depth 5 | Out-File $dependencyCachePath
}

# After processing all packages ensure that the dependencies cache is sorted
# --------------------------------------------------------------------------
# The cache maps repository -> package -> version -> list of dependency names, except for
# the "unavailable_packages" entry which maps repository -> list of package names.
Add-LogMessage -Level Info "Sorting dependency cache..."
$sortedDependencies = [ordered]@{}
foreach ($repoName in $($dependencyCache.Keys | Sort-Object)) {
    $sortedDependencies[$repoName] = [ordered]@{}
    if ($repoName -eq "unavailable_packages") {
        # This entry holds flat lists rather than nested hashtables so handle it explicitly
        foreach ($mirrorName in $($dependencyCache[$repoName].Keys | Sort-Object)) {
            $sortedDependencies[$repoName][$mirrorName] = @($dependencyCache[$repoName][$mirrorName] | Where-Object { $_ } | Sort-Object | Get-Unique)
        }
        continue
    }
    foreach ($pkgName in $($dependencyCache[$repoName].Keys | Sort-Object)) {
        $sortedDependencies[$repoName][$pkgName] = [ordered]@{}
        foreach ($version in $($dependencyCache[$repoName][$pkgName].Keys | Sort-Object)) {
            # Get-Unique is the built-in replacement for the external 'uniq' binary
            $sortedDependencies[$repoName][$pkgName][$version] = @($dependencyCache[$repoName][$pkgName][$version] | Sort-Object | Get-Unique)
        }
    }
}
$sortedDependencies | ConvertTo-Json -Depth 5 | Out-File $dependencyCachePath


# Add a log message for any problematic packages
# ----------------------------------------------
# Core packages that also appeared as a dependency of another package would have been
# included anyway. Get-Unique is used instead of the external 'uniq' binary.
$unneededCorePackages = @($corePackageList | Where-Object { $_ -In $allDependencies } | Sort-Object | Get-Unique)
if ($unneededCorePackages) {
    Add-LogMessage -Level Warning "... found $($unneededCorePackages.Count) core packages that would have been included as dependencies: $unneededCorePackages"
}
# Packages that could not be found in this mirror's repository
$unavailablePackages = $sortedDependencies["unavailable_packages"][$MirrorType]
if ($unavailablePackages) {
    Add-LogMessage -Level Warning "... ignored $($unavailablePackages.Count) dependencies that could not be found in ${MirrorType}: $unavailablePackages"
}


# Write the full package list to the expanded whitelist
# -----------------------------------------------------
# De-duplicate before logging so that the reported count matches what is written.
# Get-Unique is used instead of the external 'uniq' binary.
$expandedWhitelist = @($packageWhitelist | Sort-Object | Get-Unique)
Add-LogMessage -Level Info "Writing $($expandedWhitelist.Count) packages to the expanded whitelist..."
$expandedWhitelist | Out-File $fullWhitelistPath
2 changes: 1 addition & 1 deletion deployment/administration/SHM_Update_Mirror_Whitelists.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if (-Not $whitelistDirectory) { $whitelistDirectory = Join-Path $PSScriptRoot ".
# -----------------------------------
foreach ($mirrorType in $mirrorTypes) {
$fullMirrorType = "${mirrorType}".ToLower().Replace("cran", "r-cran").Replace("pypi", "python-pypi")
$whitelistPath = Join-Path $whitelistDirectory "whitelist-core-${fullMirrorType}-tier${tier}.list".ToLower() -Resolve
$whitelistPath = Join-Path $whitelistDirectory "whitelist-full-${fullMirrorType}-tier${tier}.list".ToLower() -Resolve
$whiteList = Get-Content $whitelistPath -Raw -ErrorVariable notExists -ErrorAction SilentlyContinue
if ($notExists) {
Add-LogMessage -Level Failure "Could not find whitelist at '$whitelistPath'"
Expand Down
11 changes: 11 additions & 0 deletions deployment/dsvm_images/packages/packages-r-bioconductor.list
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
annotate
apeglm
Biobase
BiocGenerics
BiocInstaller
Biostrings
ChemmineR
clusterProfiler
ComplexHeatmap
Expand All @@ -16,19 +20,25 @@ fastseg
FlowSOM
flowUtils
ggtree
GlobalAncova
GO
GO.db
GOSemSim
GOstats
graph
graphite
GSEABase
GSVA
Gviz
interactiveDisplayBase
KEGGgraph
limma
made4
MassSpecWavelet
metagenomeSeq
minet
MLInterfaces
moe430a
monocle
pathview
pcaMethods
Expand All @@ -37,6 +47,7 @@ RankProd
RBGL
RDAVIDWebService
Rgraphviz
Rsamtools
safe
SC3
scater
Expand Down
16 changes: 2 additions & 14 deletions deployment/dsvm_images/packages/packages-r-cran.list
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ coda
codetools
colorRamps
colorspace
compiler
COMBAT
corrgram
corrplot
cowplot
Expand All @@ -32,7 +32,6 @@ crayon
CVST
cvTools
data.table
datasets
DBI
deepnet
devtools
Expand Down Expand Up @@ -84,9 +83,6 @@ ggvis
glmnet
googleVis
gplots
graphics
grDevices
grid
gridExtra
groupdata2
gtable
Expand Down Expand Up @@ -134,8 +130,6 @@ mboost
mclust
MCMCpack
McSpatial
methods
Metrics
mgcv
microbenchmark
mime
Expand All @@ -150,7 +144,6 @@ neuralnet
nlme
nnet
odbc
parallel
parallelMap
ParamHelpers
party
Expand Down Expand Up @@ -184,6 +177,7 @@ rgdal
rgeos
rgl
rJava
RMariaDB
rmarkdown
RMySQL
ROCR
Expand All @@ -204,11 +198,8 @@ sourcetools
sp
spacyr
spatial
splines
sqldf
stargazer
stats
stats4
stm
stringi
stringr
Expand All @@ -222,17 +213,14 @@ text2vec
tgp
threejs
tibble
tictoc
tidyr
tidytext
tidyverse
tmap
tools
topicmodels
traj
tsne
urca
utils
uuid
varbvs
varhandle
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ function Deploy-PackageMirror {
# --------------------
$cloudInitPath = Join-Path $PSScriptRoot ".." "cloud_init" "cloud-init-mirror-${mirrorDirection}-${MirrorType}.yaml".ToLower()
$fullMirrorType = "${MirrorType}".ToLower().Replace("cran", "r-cran").Replace("pypi", "python-pypi")
$whitelistPath = Join-Path $PSScriptRoot ".." ".." ".." "environment_configs" "package_lists" "whitelist-core-${fullMirrorType}-tier${tier}.list".ToLower() # do not resolve this path as we have not tested whether it exists yet
$whitelistPath = Join-Path $PSScriptRoot ".." ".." ".." "environment_configs" "package_lists" "whitelist-full-${fullMirrorType}-tier${tier}.list".ToLower() # do not resolve this path as we have not tested whether it exists yet
$cloudInitYaml = Resolve-CloudInit -MirrorType $MirrorType -MirrorDirection $MirrorDirection -CloudInitPath $cloudInitPath -WhitelistPath $whitelistPath

# Construct IP address for this mirror
Expand Down
Loading

0 comments on commit 65d87bf

Please sign in to comment.