resolve

Alexander-Barth · Jan 19, 2024 · cb04c4e · cb04c4e
2 parents d089d0c + a7dd993
commit cb04c4e
Show file tree

Hide file tree

Showing 18 changed files with 131 additions and 78 deletions.
diff --git a/README.md b/README.md
@@ -192,16 +192,14 @@ and computes the maximum of each slice and the average of each maximum over all
 This operation is repeated 100 times.
 The code is available at https://github.com/Alexander-Barth/NCDatasets.jl/tree/master/test/perf .
 
-
 | Module           | median | minimum |  mean | std. dev. |
 |:---------------- | ------:| -------:| -----:| ---------:|
-| R-ncdf4          |  0.362 |   0.342 | 0.364 |     0.013 |
-| python-netCDF4   |  0.557 |   0.534 | 0.561 |     0.013 |
-| julia-NCDatasets |  0.164 |   0.161 | 0.170 |     0.011 |
+| R-ncdf4          |  0.407 |   0.384 | 0.407 |     0.010 |
+| python-netCDF4   |  0.475 |   0.463 | 0.476 |     0.010 |
+| julia-NCDatasets |  0.265 |   0.249 | 0.267 |     0.011 |
 
-All runtimes are in seconds.
-Julia 1.9.0 (with NCDatasets 0.12.16), R 4.1.2 (with ncdf4 1.21) and Python 3.10.6 (with netCDF4 1.6.1).
-This CPU is a i7-7700.
+All runtimes are in seconds. We use Julia 1.10.0 (with NCDatasets 0.14.0), R 4.1.2 (with ncdf4 1.22) and Python 3.10.12 (with netCDF4 1.6.5)
+on an i5-1135G7 CPU and NVMe SSD (WDC WDS100T2B0C).
 
 
 # Filing an issue
@@ -230,7 +228,7 @@ The package [NetCDF.jl](https://github.com/JuliaGeo/NetCDF.jl) from Fabian Gans
 
 # Credits
 
-`netcdf_c.jl`, `build.jl` and the error handling code of the NetCDF C API are from NetCDF.jl by Fabian Gans (Max-Planck-Institut für Biogeochemie, Jena, Germany) released under the MIT license.
+`netcdf_c.jl` and the error handling code of the NetCDF C API are from NetCDF.jl by Fabian Gans (Max-Planck-Institut für Biogeochemie, Jena, Germany) released under the MIT license.
 
 
 

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -5,4 +5,4 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 
 [compat]
-Documenter = "0.27"
+Documenter = "1"
diff --git a/docs/make.jl b/docs/make.jl
@@ -18,8 +18,8 @@ makedocs(
         "Attributes" => "attributes.md",
         "Fill values" => "fillvalue.md",
         "Performance tips" => "performance.md",
+        "Other features" => "other.md",
         "Known issues" => "issues.md",
-        "Experimental features" => "experimental.md",
         "Tutorials" => "tutorials.md",
     ],
     checkdocs = :none,

diff --git a/docs/src/dataset.md b/docs/src/dataset.md
@@ -48,8 +48,8 @@ close(ds)
 
 ```@docs
 defGroup
-getindex(g::NCDatasets.Groups,groupname::AbstractString)
-Base.keys(g::NCDatasets.Groups)
+getindex(g::NCDatasets.Groups,groupname)
+keys(g::NCDatasets.Groups)
 ```
 
 ## Common methods

diff --git a/docs/src/dimensions.md b/docs/src/dimensions.md
@@ -9,6 +9,7 @@ haskey(a::NCDatasets.NCIterable,name::AbstractString)
 defDim
 unlimited(d::NCDatasets.Dimensions)
 setindex!(d::NCDatasets.Dimensions,len,name::AbstractString)
+renameDim(ds::NCDataset,oldname::Union{AbstractString,Symbol},newname::Union{AbstractString,Symbol})
 ```
 
 One can iterate over a list of dimensions as follows:

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -6,7 +6,7 @@ in [CommonDataModel.jl](https://github.com/JuliaGeo/CommonDataModel.jl).
 All functions defined by CommonDataModel.jl are also available for NetCDF data, including:
 * virtually concatenating multiple files along a given dimension
 * create a virtual subset (`view`) by indices or by values of coordinate variables (`CommonDataModel.select`, `CommonDataModel.@select`)
-* group, map and reduce (with `mean`, standard deviation `std`, ...) a variable (`CommonDataModel.groupby`, `CommonDataModel.@groupby`) and rolling reductions like running means `CommonDataModel.rolling`). 
+* group, map and reduce (with `mean`, standard deviation `std`, ...) a variable (`CommonDataModel.groupby`, `CommonDataModel.@groupby`) and rolling reductions like running means (`CommonDataModel.rolling`).
 
 ## Installation
 
@@ -30,13 +30,13 @@ Pkg.add(PackageSpec(name="NCDatasets", rev="master"))
 
 ## Contents
 
-To get started quickly see the [Quickstart](@ref) section. Otherwise see the following pages for details:
+To get started quickly see the [Quick start](@ref) section. Otherwise see the following pages for details:
 
 * [Datasets](@ref) : reading/writing NetCDF datasets (including NetCDF groups) and examining their contents.
 * [Dimensions](@ref) : accessing/creating NetCDF dimensions
 * [Variables](@ref) : accessing/examining the variables (or dimensions) stored within a NetCDF dataset.
 * [Attributes](@ref) : accessing/creating NetCDF attributes
-* See [Fill values and missing values](@ref), [Performance tips](@ref performance_tips), [Known issues](@ref), [Experimental features](@ref) for more information.
+* See [Fill values and missing values](@ref), [Performance tips](@ref performance_tips), [Other features](@ref) and [Known issues](@ref) for more information.
 
 ## Quick start
 
@@ -337,4 +337,4 @@ close(ds)
 ## API and semantic versioning
 
 The package aims to follow [semantic versioning](https://semver.org/).
-[As in julia](https://docs.julialang.org/en/v1/manual/faq/#How-does-Julia-define-its-public-API), what is considered as public API and covered by semantic versioning is what documented and not marked as experimental or internal. 
+[As in julia](https://docs.julialang.org/en/v1/manual/faq/#How-does-Julia-define-its-public-API), what is considered as public API and covered by semantic versioning is what is documented and not marked as experimental or internal.
diff --git a/docs/src/issues.md b/docs/src/issues.md
@@ -72,7 +72,7 @@ Having outdated versions of HDF5 or NetCDF libraries installed can be an issue o
 
 ## Using a custom NetCDF library
 
-The NetCDF library `libnetcdf.so` is installed as an artifact via the package `NetCDF_jll`.
+The NetCDF library `libnetcdf.so` is installed as an artifact via the package [NetCDF_jll](https://github.com/JuliaBinaryWrappers/NetCDF_jll.jl).
 You can override which `libnetcdf.so` gets loaded through the `Preferences` package, as follows:
 
 ``` julia
@@ -126,7 +126,7 @@ Julia need to be restarted after this file is placed in the your working directo
 
 ## Using non-official julia builds
 
-Julia and NetCDF_jll have several common dependencies (curl, MbedTLS, zlib).
+Julia and [NetCDF_jll](https://github.com/JuliaBinaryWrappers/NetCDF_jll.jl) have several common dependencies (curl, MbedTLS, zlib).
 Non-official julia builds will work only if they use exactly the same library version as those used to compile NetCDF. This is unlikely to be the case in general and outside of our control. Therefore non-official julia builds are not supported.
 Official julia builds are available at [https://julialang.org/downloads/](https://julialang.org/downloads/).
 

diff --git a/docs/src/experimental.md → docs/src/other.md b/docs/src/experimental.md → docs/src/other.md
@@ -1,4 +1,4 @@
-# Experimental features
+# Other features
 
 ## Multi-file support
 

diff --git a/src/NCDatasets.jl b/src/NCDatasets.jl
@@ -61,7 +61,6 @@ const default_timeunits = "days since 1900-00-00 00:00:00"
 const SymbolOrString = Union{Symbol, AbstractString}
 
 include("types.jl")
-include("colors.jl")
 include("errorhandling.jl")
 include("netcdf_c.jl")
 include("dataset.jl")

diff --git a/src/colors.jl b/src/colors.jl
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -351,7 +351,12 @@ name `temperature` and a dimension with the name `lon`.
 """
 Base.haskey(a::NCIterable,name::AbstractString) = name in keys(a)
 
+"""
+    names = dimnames(ds::AbstractNCDataset; parents = false)
 
+Return all dimension names defined in `ds`. When `parents` is `true`,
+also the names of parent groups are returned (default is `false`).
+"""
 function dimnames(ds::AbstractNCDataset; parents = false)
     dn = keys(ds.dim)
 
@@ -365,10 +370,12 @@ function dimnames(ds::AbstractNCDataset; parents = false)
     return dn
 end
 
-dim(ds::AbstractNCDataset,name::SymbolOrString) = ds.dim[name]
-
+"""
+    len = dim(ds::AbstractNCDataset,name::SymbolOrString)
 
-#     write(dest_filename::AbstractString, src::AbstractNCDataset; include = keys(src), exclude = [])
+Return the (current) length of the dimension `name` of the dataset `ds`.
+"""
+dim(ds::AbstractNCDataset,name::SymbolOrString) = ds.dim[name]
 
 
 function Base.write(dest_filename::AbstractString, src::AbstractDataset; kwargs...)

diff --git a/src/netcdf_c.jl b/src/netcdf_c.jl
@@ -1037,6 +1037,24 @@ function nc_def_var_deflate(ncid::Integer,varid::Integer,shuffle::Bool,deflate::
     check(ccall((:nc_def_var_deflate,libnetcdf),Cint,(Cint,Cint,Cint,Cint,Cint),ncid,varid,shuffle,deflate,deflate_level))
 end
 
+# filters
+function nc_inq_filter_avail(ncid::Integer,id::Integer)  # wraps the C function `nc_inq_filter_avail`; returns a Bool
+    ret = ccall((:nc_inq_filter_avail,libnetcdf),Cint,(Cint,Cuint),ncid,id)  # `id` is passed to C as a Cuint
+    return ret == NC_NOERR  # the status code is compared directly instead of being handed to `check`
+end
+
+function nc_def_var_zstandard(ncid::Integer,varid::Integer,level::Integer)  # wraps the C function `nc_def_var_zstandard`
+    check(ccall((:nc_def_var_zstandard,libnetcdf),Cint,(Cint,Cint,Cint),ncid,varid,level))  # the C status code is handed to `check`
+end
+
+function nc_inq_var_zstandard(ncid::Integer,varid::Integer)  # wraps the C function `nc_inq_var_zstandard`; returns a 2-tuple
+    hasfilterp = Ref(Cint(0))  # passed to C as a Ptr{Cint}
+    levelp = Ref(Cint(0))  # passed to C as a Ptr{Cint}
+    check(ccall((:nc_inq_var_zstandard,libnetcdf),Cint,(Cint,Cint,Ptr{Cint},Ptr{Cint}),ncid,varid,hasfilterp,levelp))  # the C status code is handed to `check`
+
+    return hasfilterp[] == 1, levelp[]  # first element is `true` only when the flag equals 1; second is the raw Cint level
+end
+
 function nc_inq_var_deflate(ncid::Integer,varid::Integer)
     shufflep = Ref(Cint(0))
     deflatep = Ref(Cint(0))

diff --git a/src/types.jl b/src/types.jl
@@ -38,5 +38,4 @@ mutable struct NCDataset{TDS,T_experimental_missing_value} <: AbstractNCDataset
     _experimental_missing_value::T_experimental_missing_value
 end
 
-"Alias to `NCDataset`"
 const Dataset = NCDataset
diff --git a/test/perf/README.md b/test/perf/README.md
@@ -0,0 +1,58 @@
+# Benchmarks
+
+The operating system typically caches access to the file system.
+To make these benchmarks more realistic, the file system cache is dropped at every iteration so that the disk IO *is* included in the reported run times.
+On Linux, the caches are dropped by writing `3` to the file `/proc/sys/vm/drop_caches`; however, this requires superuser privileges.
+These benchmarks require a Linux operating system (as dropping file caches is OS-specific).
+
+
+## Installation
+
+### Julia packages
+
+Within a Julia shell install `BenchmarkTools` and `NCDatasets` using these julia commands:
+
+```julia
+using Pkg
+Pkg.add(["BenchmarkTools","NCDatasets"])
+```
+
+### Python packages
+
+Install the python packages `netCDF4` and `numpy` using this shell command:
+
+```bash
+pip install netCDF4 numpy
+```
+
+### R packages
+
+Within an R shell install `microbenchmark` and `ncdf4` using these R commands:
+```R
+install.packages("microbenchmark")
+install.packages("ncdf4")
+```
+
+## Running the benchmark
+
+These are the steps to run the benchmark:
+
+* Prepare the file `filename_fv.nc` with:
+
+```bash
+julia generate_data.jl
+```
+
+* As a *root user*, run the shell script `benchmark.sh`. It is necessary that the root user has access to the Julia, python and R netCDF packages (NCDatasets, netCDF4 and ncdf4 respectively).
+
+```bash
+./benchmark.sh
+```
+
+If all packages are installed in the home directory of an unprivileged user, e.g. `my_user_name`, they can be made available to the root user by temporarily changing the `HOME` environment variable to `/home/my_user_name` in the root shell before running `./benchmark.sh`:
+
+```bash
+HOME=/home/my_user_name ./benchmark.sh
+```
+
+The script will output a markdown table with the benchmark statistics.
diff --git a/test/perf/benchmark-R-ncdf4.R b/test/perf/benchmark-R-ncdf4.R
@@ -1,4 +1,4 @@
-# Install dependencies via:
+# Install dependencies via the R commands:
 #
 # install.packages("microbenchmark")
 # install.packages("ncdf4")
@@ -11,6 +11,11 @@ print(paste("ncdf4 version: ",packageVersion("ncdf4")))
 fname = "filename_fv.nc"
 
 process <- function(fname) {
+  # drop file caches; requires root
+  fileConn<-file("/proc/sys/vm/drop_caches",open = "wt")
+  writeLines("3", fileConn)
+  close(fileConn)
+
   nc = nc_open(fname)
 
   # how do you get the dimension from the file?

diff --git a/test/perf/benchmark-RNetCDF.R b/test/perf/benchmark-RNetCDF.R
diff --git a/test/perf/benchmark-julia-NCDatasets.jl b/test/perf/benchmark-julia-NCDatasets.jl
@@ -14,6 +14,9 @@ function compute(v)
 end
 
 function process(fname)
+    # drop file caches; requires root
+    write("/proc/sys/vm/drop_caches","3")
+
     ds = NCDataset(fname,"r") do ds
         v = ds["v1"];
         tot = compute(v)
@@ -24,7 +27,7 @@ end
 fname = "filename_fv.nc"
 tot = process(fname)
 
-@show tot
+println("result ",tot)
 
 bm = run(@benchmarkable process(fname) samples=100 seconds=10000)
 

diff --git a/test/perf/benchmark-python-netCDF4.py b/test/perf/benchmark-python-netCDF4.py
@@ -1,3 +1,7 @@
+# Install dependencies via the shell commands:
+#
+# pip install netCDF4 numpy
+
 import netCDF4
 import numpy as np
 import timeit
@@ -10,20 +14,25 @@ def compute(v):
     return tot/v.shape[0]
 
 def process(fname):
+    with open("/proc/sys/vm/drop_caches","w") as f:  # drop file caches; requires root
+        f.write("3")
+
     with netCDF4.Dataset(fname) as ds:
         v = ds["v1"]
         tot = compute(v)  # the value returned by compute(v) is the function's result
         return tot
 
-def process_example():
+
+if __name__ == "__main__":
     fname = "filename_fv.nc";
-    process(fname)
+    tot = process(fname)
 
+    print("result ",tot)
 
-setup = "from __main__ import process_example"
-print("python-netCDF4 version ",netCDF4.__version__)
+    setup = "from __main__ import process"
+    print("python-netCDF4 version ",netCDF4.__version__)
 
-benchtime = timeit.repeat("process_example()", setup=setup,number = 1, repeat = 100)
-with open("python-netCDF4.txt","w") as f:
-    for bt in benchtime:
-        print(bt,file=f)
+    benchtime = timeit.repeat(lambda: process(fname), setup=setup,number = 1, repeat = 100)
+    with open("python-netCDF4.txt","w") as f:
+        for bt in benchtime:
+            print(bt,file=f)