Skip to content

Commit

Permalink
Merge branch 'joss'
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Barth committed Jul 16, 2024
2 parents dde1dd4 + 189a3ad commit 5e3c297
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 0 deletions.
136 changes: 136 additions & 0 deletions paper/paper.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
% OGC standard describing the netCDF classic and 64-bit offset binary encoding.
% The month is given as the standard three-letter macro so that every
% bibliography style can format it.
@manual{OGC_netCDF,
  title        = {{NetCDF Binary Encoding Extension Standard: NetCDF Classic and 64-bit Offset Format}},
  organization = {Open Geospatial Consortium},
  number       = {OGC 10-092r3},
  year         = 2011,
  month        = apr,
  url          = {http://www.opengis.net/doc/IS/netcdf-binary/1.0},
}

% NetCDF-4 conference paper.
% Rew is the surname (compare the Rew90 entry), so it goes before the comma.
% The conference name is stored in booktitle, which inproceedings entries
% require; organization is reserved for the sponsoring body.
@inproceedings{Rew2006,
  author    = {Rew, Russell and Hartnett, Edward and Caron, John},
  title     = {{NetCDF-4: Software Implementing an Enhanced Data Model for the Geosciences}},
  booktitle = {22nd International Conference on Interactive Information Processing Systems for Meteorology, Oceanography, and Hydrology},
  year      = {2006},
  month     = jan,
}

% Original netCDF interface paper.
% Page ranges use a double hyphen so they are typeset as an en dash.
@article{Rew90,
  author  = {Rew, R. and Davis, G.},
  title   = {{NetCDF: an interface for scientific data access}},
  journal = {IEEE Computer Graphics and Applications},
  year    = {1990},
  volume  = {10},
  number  = {4},
  pages   = {76--82},
  doi     = {10.1109/38.56302},
}

% OGC community standard for the Zarr v2 storage specification.
% The month uses the standard macro, and the document number carries the same
% OGC prefix as the OGC_netCDF entry.
@manual{OGC_Zarr,
  title        = {{Zarr Storage Specification 2.0 Community Standard}},
  organization = {Open Geospatial Consortium},
  number       = {OGC 21-050r1},
  year         = 2022,
  month        = jun,
  url          = {http://www.opengis.net/doc/CS/zarr/2.0},
}


% DINCAE 2.0, final peer-reviewed article in Geoscientific Model Development.
% The DOI points to the published article rather than the discussion preprint
% (the gmd-2021-353 identifier); the volume is the one encoded in that DOI.
% Author names use the unambiguous Last, First form.
@article{Barth2022,
  author    = {Barth, A. and Alvera-Azc{\'{a}}rate, A. and Troupin, C. and Beckers, J.-M.},
  title     = {{DINCAE} 2.0: multivariate convolutional neural network with error estimates to reconstruct sea surface temperature satellite and altimetry observations},
  journal   = {Geoscientific Model Development},
  publisher = {Copernicus {GmbH}},
  volume    = {15},
  year      = 2022,
  doi       = {10.5194/gmd-15-2183-2022},
}

% Arctic Ocean sea surface height anomaly and geostrophic velocity data set paper.
@article{Doglioni2023,
  author  = {Doglioni, F. and Ricker, R. and Rabe, B. and Barth, A. and Troupin, C. and Kanzow, T.},
  title   = {{Sea surface height anomaly and geostrophic current velocity from altimetry measurements over the Arctic Ocean (2011--2020)}},
  journal = {Earth System Science Data},
  year    = {2023},
  volume  = {15},
  number  = {1},
  pages   = {225--263},
  doi     = {10.5194/essd-15-225-2023},
}

% Western Mediterranean nutrient climatology data set paper.
@article{Belgacem21,
  author  = {Belgacem, M. and Schroeder, K. and Barth, A. and Troupin, C. and Pavoni, B. and Raimbault, P. and Garcia, N. and Borghini, M. and Chiggiato, J.},
  title   = {{Climatological distribution of dissolved inorganic nutrients in the western Mediterranean Sea (1981--2017)}},
  journal = {Earth System Science Data},
  year    = {2021},
  volume  = {13},
  number  = {12},
  pages   = {5915--5949},
  doi     = {10.5194/essd-13-5915-2021},
  url     = {https://essd.copernicus.org/articles/13/5915/2021/},
}

% JOSS paper for Oceananigans.jl; the pages field holds the JOSS article number.
@article{OceananigansJOSS,
  author    = {Ali Ramadhan and Gregory LeClaire Wagner and Chris Hill and Jean-Michel Campin and Valentin Churavy and Tim Besard and Andre Souza and Alan Edelman and Raffaele Ferrari and John Marshall},
  title     = {{Oceananigans.jl: Fast and friendly geophysical fluid dynamics on GPUs}},
  journal   = {Journal of Open Source Software},
  publisher = {The Open Journal},
  year      = {2020},
  volume    = {5},
  number    = {53},
  pages     = {2018},
  doi       = {10.21105/joss.02018},
  url       = {https://doi.org/10.21105/joss.02018},
}

% Global ocean climatology paper in Frontiers in Environmental Science.
@article{Shahzadi21,
  author  = {Shahzadi, K. and Pinardi, N. and Barth, A. and Troupin, C. and Lyubartsev, V. and Simoncelli, S.},
  title   = {{A New Global Ocean Climatology}},
  journal = {Frontiers in Environmental Science},
  year    = {2021},
  volume  = {9},
  doi     = {10.3389/fenvs.2021.711363},
  url     = {https://www.frontiersin.org/articles/10.3389/fenvs.2021.711363},
  issn    = {2296-665X},
}


% Software citation for the NCDatasets.jl repository, pinned to one commit.
% The title is case-protected by the double braces, so the language name must
% be written with its proper capital letter here.
@misc{NCDatasets,
  author       = {Alexander Barth},
  title        = {{NCDatasets: A Julia package for manipulating netCDF data sets}},
  howpublished = {\url{https://github.com/Alexander-Barth/NCDatasets.jl}},
  journal      = {GitHub repository},
  publisher    = {GitHub},
  year         = {2023},
  commit       = {90ed5641684604096558a77020038583e1f2459f},
}


% SpeedyWeather.jl manuscript, recorded as submitted and not yet published.
% Unpublished entries require a note field, so the submission status lives
% there instead of being appended to a journal name.
@unpublished{SpeedyWeather,
  author = {Milan Klöwer and Maximilian Gelbrecht and Daisuke Hotta and Justin Willmert and Simone Silvestri and Gregory L Wagner and Alistair White and Sam Hatfield and Tom Kimpson and Navid C Constantinou and Chris Hill},
  title  = {{SpeedyWeather.jl: Reinventing atmospheric general circulation models towards interactivity and extensibility}},
  note   = {Submitted to the Journal of Open Source Software},
  year   = {2023},
}


% CF metadata conventions document, version 1.11, with its access date.
@misc{Eaton2023,
  author    = {Brian Eaton and Jonathan Gregory and Bob Drach and Karl Taylor and Steve Hankin and Jon Blower and John Caron and Rich Signell and Phil Bentley and Greg Rappa and Heinke Höck and Alison Pamment and Martin Juckes and Martin Raspaud and Randy Horne and Timothy Whiteaker and David Blodgett and Charlie Zender and Daniel Lee and David Hassell and Alan D. Snow and Tobias Kölling and Dave Allured and Aleksandar Jelenak and Anders Meier Soerensen and Lucile Gaultier and Sylvain Herlédan and Fernando Manzano and Lars Bärring and Christopher Barker and Sadie Bartholomew},
  title     = {{NetCDF Climate and Forecast (CF) Metadata Conventions v1.11}},
  publisher = {CF Conventions Committee},
  year      = 2023,
  url       = {http://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html},
  urldate   = {2023-12-05},
}


% CF data model paper (CF-1.6) with the cf-python reference implementation.
@article{Hassell2017,
  author  = {Hassell, D. and Gregory, J. and Blower, J. and Lawrence, B. N. and Taylor, K. E.},
  title   = {{A data model of the Climate and Forecast metadata conventions (CF-1.6) with a software implementation (cf-python v2.1)}},
  journal = {Geoscientific Model Development},
  year    = {2017},
  volume  = {10},
  number  = {12},
  pages   = {4619--4646},
  doi     = {10.5194/gmd-10-4619-2017},
  url     = {https://gmd.copernicus.org/articles/10/4619/2017/},
}
79 changes: 79 additions & 0 deletions paper/paper.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
---
title: 'NCDatasets.jl: a Julia package for manipulating netCDF data sets'
tags:
- julia
- netcdf
- oceanography
- meteorology
- earth-observation
- climatology
- opendap
- climate-and-forecast-conventions
authors:
- name: Alexander Barth
orcid: 0000-0003-2952-5997
affiliation: 1
affiliations:
- name: GHER, University of Liège, Liège, Belgium
index: 1
date: 13 January 2024
bibliography: paper.bib
---

# Summary

NCDatasets is a Julia package that allows users to read, create and modify netCDF files (Network Common Data Format). It is based on the Unidata netCDF library [@Rew90; @Rew2006; @OGC_netCDF] which also supports reading data from remote servers using OPeNDAP (Open-source Project for a Network Data Access Protocol, https://www.opendap.org) and the Zarr file format [@OGC_Zarr]. These additional formats are also accessible to users of NCDatasets.

The aim of NCDatasets is to expose the data and metadata stored in the NetCDF file as lazy data-structures (in particular arrays and dictionaries) used in Julia.
Lazy in this context means that only the requested subset of data is loaded into RAM or written to the disk. One of the design goals of NCDatasets and the netCDF library in general is being able to work with datasets which are potentially larger than the total amount of RAM in a system and to process that data per subset.

NetCDF allows users to add metadata to datasets and individual variables in the form of a list of key-value pairs called attributes. The meaning of these attributes is
standardized in the CF conventions [@Eaton2023]. While originally proposed for NetCDF files, the CF conventions are now also applied in the context of other formats like GRIB (e.g. the Julia package [GRIBDatasets](https://github.com/JuliaGeo/GRIBDatasets.jl) or the Python package [cfgrib](https://github.com/ecmwf/cfgrib)).


# Statement of need

NetCDF is a commonly used data format in Earth sciences (in particular oceanography, atmospheric sciences and climatology) to store model data, satellite observations and in situ observations. It is particularly well established as a format for distributing and archiving data. The Julia programming language with its native array types, just-in-time compilation and automatic function specialization based on data types is well suited for processing and analyzing large amounts of data often found in Earth sciences.
Therefore, a convenient API mapping the concepts for the NetCDF format and CF convention to the corresponding equivalents of the Julia programming language is desirable.
There are currently 64 registered Julia packages (as of 15 January 2024) that have NCDatasets as a direct or indirect dependency (not counting optional dependencies).
For example, NCDatasets is used with satellite data [@Barth2022; @Doglioni2023], in situ observations [@Belgacem21; @Shahzadi21] as well as numerical ocean models [@OceananigansJOSS] and atmospheric models [@SpeedyWeather].


# Installation

NCDatasets supports Julia 1.6 and later and can be installed with the Julia package manager using the following Julia commands:

```julia
using Pkg
Pkg.add("NCDatasets")
```

This will automatically install all dependencies and in particular the Unidata netCDF C library for which compiled binaries are currently available for Linux, FreeBSD, Mac OS and Windows thanks to the efforts of the [Yggdrasil.jl](https://github.com/JuliaPackaging/Yggdrasil/) project.

# Features

The main objects in the netCDF data model are the dataset (typically representing a whole file), variables (named n-dimensional arrays with named dimensions), dimensions (mapping the dimension names to the corresponding length), attributes and groups (a dataset contained within a dataset). Groups can be recursively nested. Variable names must be unique within a given group, but in two different groups, variable names can be re-used. Current features of NCDatasets include:

* Attributes, dimensions and groups are exposed to users as dictionary-like objects. Modifying them will directly modify the underlying NetCDF file as long as the file is open in write mode.
* Variables are exposed as array-like objects. Indexing these arrays with the usual Julia syntax will result in loading the corresponding subset into memory. Likewise, assigning a value to a subset will write the data to the disk.
* The netCDF C API provides several functions to query information about the various objects of the netCDF data model. It is possible to query the data and metadata of a NetCDF file in the same way that one would query an array or dictionary.
* Every time a netCDF variable is loaded the required memory is automatically allocated. Once this memory is no longer used it will be deallocated by Julia's garbage collector. For high-performance applications, the repeated allocation and deallocation can cause a significant performance overhead. For this use-case, NCDatasets provides in-place variants for loading data.
* Data stored in a contiguous ragged array representation [@Hassell2017; @Eaton2023] are loaded as a vector of vectors. It is typically used to load a list of in situ profiles or time series, each of different length.
* Storage parameters like compression and data chunks can be queried and defined.
* Data transformations defined via the CF conventions are applied by default (including scaling, adding an offset, conversion to the `DateTime` structure). Several calendars are standardized in the CF conventions (standard, Gregorian, proleptic Gregorian, Julian, all leap, no leap, 360 day). Where possible, dates are automatically converted to Julia's native date time type, which uses the proleptic Gregorian calendar conforming to the ISO 8601 standard. Date types are handled using the package [CFTimes](https://github.com/JuliaGeo/CFTimes.jl) (originally part of NCDatasets).
* Additional functionality includes multi-file support (virtually concatenating NetCDF variables spanning multiple files), views of variables and datasets (virtual subsets without loading the whole data in memory), and subsetting variables and datasets using coordinate values instead of indices using the package [CommonDataModel](https://github.com/JuliaGeo/CommonDataModel.jl) (also originally part of NCDatasets).


# Similar software

The Julia package [NetCDF.jl](https://github.com/JuliaGeo/NetCDF.jl) from Fabian Gans and contributors is an alternative to this package which supports a more Matlab/Octave-like interface for reading and writing netCDF files while this package, NCDatasets, is more influenced by the Python [netCDF4](https://github.com/Unidata/netcdf4-python) package. In the R community, the packages [RNetCDF](https://github.com/mjwoods/RNetCDF) and [ncdf4](https://cirrus.ucsd.edu/~pierce/ncdf/) fulfill a similar role.

# Acknowledgements

I thank [all contributors](https://github.com/Alexander-Barth/NCDatasets.jl/graphs/contributors) to this package, among others, George Datseris, Tristan Carion, Martijn Visser, Charles Troupin, Rafael Schouten, Argel Ramírez Reyes, Kenechukwu Uba, Philippe Roy, Gregory L. Wagner, Gael Forget and Haakon Ludvig Langeland Ervik as well as Unidata for the [netCDF C library](https://github.com/Unidata/netcdf-c) and their time and efforts responding to my questions and issues. All contributors to the [Yggdrasil.jl](https://github.com/JuliaPackaging/Yggdrasil/) project for their effort in building the netCDF library and the required dependencies are also acknowledged.

# Funding

Acknowledgment is given to the F.R.S.-FNRS (Fonds de la Recherche Scientifique de Belgique) for funding the position of Alexander Barth. This work was partly performed with funding from the Blue-Cloud 2026 project under the Horizon Europe programme, Grant Agreement No. 101094227.

# References
Binary file added paper/paper.pdf
Binary file not shown.

0 comments on commit 5e3c297

Please sign in to comment.