diff --git a/README.md b/README.md
index cb9d7f41..5bf12a4e 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,3 @@ Clone the repository and install the code from source or use the Python package
 
 ## Documentation 
 https://qmctorch.readthedocs.io/en/latest/intro.html
-
-
-## Disclaimer
-QMCTorch is currently under developmement and most likely won't behave as expected 
diff --git a/docs/rst/install.rst b/docs/rst/install.rst
index ba2c6b83..3b0a0fe1 100644
--- a/docs/rst/install.rst
+++ b/docs/rst/install.rst
@@ -26,7 +26,7 @@ To install the code
 
 You can then test the installation :
 
- * ``cd test``
+ * ``cd tests``
  * ``pytest``
 
 
diff --git a/paper/paper.bib b/paper/paper.bib
index 629081b7..9b0a3bcf 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -82,7 +82,7 @@ @article{pyqmc
 	pages = {114801},
 	author = {William A. Wheeler and Shivesh Pathak and Kevin G. Kleiner and Shunyue Yuan and Jo{\~{a}
 }o N. B. Rodrigues and Cooper Lorsung and Kittithat Krongchon and Yueqing Chang and Yiqing Zhou and Brian Busemeyer and Kiel T. Williams and Alexander Mu{\~{n}}oz and Chun Yu Chow and Lucas K. Wagner},
-	title = {$\less$tt$\greater${PyQMC}$\less$/tt$\greater$: An all-Python real-space quantum Monte Carlo module in $\less$tt$\greater${PySCF}$\less$/tt$\greater$},
+	title = {PyQMC: An all-Python real-space quantum Monte Carlo module in PySCF},
 	journal = {The Journal of Chemical Physics}
 }
 
@@ -135,6 +135,123 @@ @Article{adf
   Url                      = {http://dx.doi.org/10.1002/jcc.1056}
 }
 
+@article{ANN_WF,
+  author  = {Yang, Peng-Jian and Sugiyama, Mahito and Tsuda, Koji and Yanai, Takeshi},
+  title   = {Artificial Neural Networks Applied as Molecular Wave Function Solvers},
+  journal = {Journal of Chemical Theory and Computation},
+  volume  = {16},
+  number  = {6},
+  pages   = {3513--3529},
+  year    = {2020},
+  doi     = {10.1021/acs.jctc.9b01132},
+  note    = {PMID: 32320233},
+  url     = {https://doi.org/10.1021/acs.jctc.9b01132},
+  eprint  = {https://doi.org/10.1021/acs.jctc.9b01132}
+}
+
+@article{Lin_2023,
+  author    = {Lin, Jeffmin and Goldshlager, Gil and Lin, Lin},
+  title     = {Explicitly antisymmetrized neural network layers for variational {Monte Carlo} simulation},
+  journal   = {Journal of Computational Physics},
+  volume    = {474},
+  pages     = {111765},
+  year      = {2023},
+  month     = feb,
+  publisher = {Elsevier {BV}},
+  doi       = {10.1016/j.jcp.2022.111765},
+  url       = {https://doi.org/10.1016/j.jcp.2022.111765}
+}
+
+@article{fixed_node,
+    author = {Sch{\"a}tzle, Z. and Hermann, J. and No{\'e}, F.},
+    title = {Convergence to the fixed-node limit in deep variational {Monte Carlo}},
+    journal = {The Journal of Chemical Physics},
+    volume = {154},
+    number = {12},
+    pages = {124108},
+    year = {2021},
+    month = mar,
+    abstract = "{Variational quantum Monte Carlo (QMC) is an ab initio method for solving the electronic Schrödinger equation that is exact in principle, but limited by the flexibility of the available Ansätze in practice. The recently introduced deep QMC approach, specifically two deep-neural-network Ansätze PauliNet and FermiNet, allows variational QMC to reach the accuracy of diffusion QMC, but little is understood about the convergence behavior of such Ansätze. Here, we analyze how deep variational QMC approaches the fixed-node limit with increasing network size. First, we demonstrate that a deep neural network can overcome the limitations of a small basis set and reach the mean-field (MF) complete-basis-set limit. Moving to electron correlation, we then perform an extensive hyperparameter scan of a deep Jastrow factor for LiH and H4 and find that variational energies at the fixed-node limit can be obtained with a sufficiently large network. Finally, we benchmark MF and many-body Ansätze on H2O, increasing the fraction of recovered fixed-node correlation energy of single-determinant Slater–Jastrow-type Ansätze by half an order of magnitude compared to previous variational QMC results, and demonstrate that a single-determinant Slater–Jastrow-backflow version of the Ansatz overcomes the fixed-node limitations. This analysis helps understand the superb accuracy of deep variational Ansätze in comparison to the traditional trial wavefunctions at the respective level of theory and will guide future improvements of the neural-network architectures in deep QMC.}",
+    issn = {0021-9606},
+    doi = {10.1063/5.0032836},
+    url = {https://doi.org/10.1063/5.0032836},
+    eprint = {https://pubs.aip.org/aip/jcp/article-pdf/doi/10.1063/5.0032836/14009445/124108\_1\_online.pdf},
+}
+
+@article{detfree_nn,
+  title = {Determinant-free fermionic wave function using feed-forward neural networks},
+  author = {Inui, Koji and Kato, Yasuyuki and Motome, Yukitoshi},
+  journal = {Physical Review Research},
+  volume = {3},
+  number = {4},
+  pages = {043126},
+  numpages = {9},
+  year = {2021},
+  month = nov,
+  publisher = {American Physical Society},
+  doi = {10.1103/PhysRevResearch.3.043126},
+  url = {https://link.aps.org/doi/10.1103/PhysRevResearch.3.043126}
+}
+
+
+@article{ANN_QMC,
+  author = {Kessler, Jan and Calcavecchia, Francesco and K{\"u}hne, Thomas D.},
+  title = {Artificial Neural Networks as Trial Wave Functions for Quantum {Monte Carlo}},
+  journal = {Advanced Theory and Simulations},
+  volume = {4},
+  number = {4},
+  pages = {2000269},
+  keywords = {Monte Carlo simulations, quantum Monte Carlo simulations, variational Monte Carlo simulations},
+  doi = {10.1002/adts.202000269},
+  url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/adts.202000269},
+  eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/adts.202000269},
+  abstract = {Inspired by the universal approximation theorem and widespread adoption of artificial neural network techniques in a diversity of fields, feed-forward neural networks are proposed as a general purpose trial wave function for quantum Monte Carlo simulations of continuous many-body systems. Whereas for simple model systems the whole many-body wave function can be represented by a neural network, the antisymmetry condition of non-trivial fermionic systems is incorporated by means of a Slater determinant. To demonstrate the accuracy of the trial wave functions, an exactly solvable model system of two trapped interacting particles, as well as the hydrogen dimer, is studied.},
+  year = {2021}
+}
+
+@article{HAN2019108929,
+  author = {Han, Jiequn and Zhang, Linfeng and E, Weinan},
+  title = {Solving many-electron {Schr{\"o}dinger} equation using deep neural networks},
+  journal = {Journal of Computational Physics},
+  volume = {399},
+  pages = {108929},
+  year = {2019},
+  issn = {0021-9991},
+  doi = {10.1016/j.jcp.2019.108929},
+  url = {https://www.sciencedirect.com/science/article/pii/S0021999119306345},
+  keywords = {Schrödinger equation, Variational Monte Carlo, Deep neural networks, Trial wave-function},
+  abstract = {We introduce a new family of trial wave-functions based on deep neural networks to solve the many-electron Schrödinger equation. The Pauli exclusion principle is dealt with explicitly to ensure that the trial wave-functions are physical. The optimal trial wave-function is obtained through variational Monte Carlo and the computational cost scales quadratically with the number of electrons. The algorithm does not make use of any prior knowledge such as atomic orbitals. Yet it is able to represent accurately the ground-states of the tested systems, including He, H2, Be, B, LiH, and a chain of 10 hydrogen atoms. This opens up new possibilities for solving large-scale many-electron Schrödinger equation.}
+}
+
+@article{choo_fermionic_2020,
+  author   = {Choo, Kenny and Mezzacapo, Antonio and Carleo, Giuseppe},
+  title    = {Fermionic neural-network states for ab-initio electronic structure},
+  journal  = {Nature Communications},
+  volume   = {11},
+  number   = {1},
+  pages    = {2368},
+  month    = may,
+  year     = {2020},
+  issn     = {2041-1723},
+  doi      = {10.1038/s41467-020-15724-9},
+  url      = {https://doi.org/10.1038/s41467-020-15724-9},
+  abstract = {Neural-network quantum states have been successfully used to study a variety of lattice and continuous-space problems. Despite a great deal of general methodological developments, representing fermionic matter is however still early research activity. Here we present an extension of neural-network quantum states to model interacting fermionic problems. Borrowing techniques from quantum simulation, we directly map fermionic degrees of freedom to spin ones, and then use neural-network quantum states to perform electronic structure calculations. For several diatomic molecules in a minimal basis set, we benchmark our approach against widely used coupled cluster methods, as well as many-body variational states. On some test molecules, we systematically improve upon coupled cluster methods and Jastrow wave functions, reaching chemical accuracy or better. Finally, we discuss routes for future developments and improvements of the methods presented.},
+}
+
+@article{backflow_1981,
+  title = {Structure of the Ground State of a Fermion Fluid},
+  author = {Schmidt, K. E. and Lee, Michael A. and Kalos, M. H. and Chester, G. V.},
+  journal = {Physical Review Letters},
+  volume = {47},
+  number = {11},
+  pages = {807--810},
+  numpages = {4},
+  year = {1981},
+  month = sep,
+  publisher = {American Physical Society},
+  doi = {10.1103/PhysRevLett.47.807},
+  url = {https://link.aps.org/doi/10.1103/PhysRevLett.47.807}
+}
 
 
 @article{jacobi_trace,
diff --git a/paper/paper.md b/paper/paper.md
index 2fed39c1..f13039f1 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -27,21 +27,21 @@ in a physically-motivated neural network. The use of `PyTorch` as a backend to p
 
 # Statement of need
 
-`QMCTorch` is a Python package using `PyTorch` [@pytorch] as a backend to perform Quantum Monte-Carlo (QMC) simulations, namely Variational Monte-Carlo,  of molecular systems. Many software such as `QMCPack`[@qmcpack], `QMC=Chem` [@qmcchem], `CHAMP` [@champ] provide high-quality implementation of advanced QMC methodologies in low-level languages (C++/Fortran).  Python implementations of QMC such as `PAUXY` [@pauxy] and `PyQMC` [@pyqmc] have also been proposed to facilitate the use and development of QMC techniques. Large efforts have been made to leverage recent development of deep learning techniques for QMC simulations with for example the creation of neural-network based wave-function ansatz [@paulinet; @ferminet] that have lead to very interesting results. `QMCTorch` allows to perform QMC simulations using physically motivated neural network architectures that closely follow the wave function ansatz used by QMC practitioners. Its architecture allows to rapidly explore new functional forms of some key elements of the wave function ansatz. Users do not need to derive analytical expressions for the gradients of the total energy w.r.t. the variational parameters, that are simply obtained via automatic diffentiation. This includes for example the parameters of the atomic orbitals that can be varioationally optimized and the atomic coordinates that allows `QMCTorch` to perform geometry optimization of molecular structures. In addition, the GPU capabilities offered by `PyTorch` combined with the parallelization over multiple computing nodes obtained via `Horovod` [@horovod], allow to deploy the simulations on large heterogenous computing architectures. In summary, `QMCTorch` provides QMC practitionners a framework to rapidly protoytpe new ideas and to test them using modern computing ressources.
+`QMCTorch` is a Python package using `PyTorch` [@pytorch] as a backend to perform Quantum Monte-Carlo (QMC) simulations, namely Variational Monte-Carlo, of molecular systems. Many software packages such as `QMCPack` [@qmcpack], `QMC=Chem` [@qmcchem], and `CHAMP` [@champ] provide high-quality implementations of advanced QMC methodologies in low-level languages (C++/Fortran). Python implementations of QMC such as `PAUXY` [@pauxy] and `PyQMC` [@pyqmc] have also been proposed to facilitate the use and development of QMC techniques. Large efforts have been made to leverage recent developments in deep learning techniques for QMC simulations, with for example the creation of neural-network based wave-function ansatz [@paulinet; @ferminet; @choo_fermionic_2020; @HAN2019108929; @ANN_QMC; @detfree_nn; @fixed_node; @Lin_2023; @ANN_WF] that have led to very interesting results. `QMCTorch` allows users to perform QMC simulations using physically motivated neural network architectures that closely follow the wave function ansatz used by QMC practitioners. Its architecture allows users to rapidly explore new functional forms of some key elements of the wave function ansatz. Users do not need to derive analytical expressions for the gradients of the total energy w.r.t. the variational parameters, which are simply obtained via automatic differentiation. This includes for example the parameters of the atomic orbitals, which can be variationally optimized, and the atomic coordinates, which allows `QMCTorch` to perform geometry optimization of molecular structures. In addition, the GPU capabilities offered by `PyTorch`, combined with the parallelization over multiple computing nodes obtained via `Horovod` [@horovod], allow the simulations to be deployed on large heterogeneous computing architectures. In summary, `QMCTorch` provides QMC practitioners with a framework to rapidly prototype new ideas and to test them using modern computing resources.
 
 
 # Wave Function Ansatz
 ![General architecture of the neural network used by `QMCTorch` to encode the wave function ansatz. The neural network computes and assembles the different elements of the wave function ansatz and can be used to compute the electronic density required for the sampling and the total energy of the system required for the wave function optimization.\label{fig:arch}](qmctorch2.png)
 
-The neural network used to encode the wave-function ansatz used in `QMCTorch` is shown in Fig. \ref{fig:arch}. As common in QMC simulations, the wave function is given by the product of a Jastrow factor, $J(r)$, that accounts for electronic correlations and a sum of Slater determinants, $D^\updownarrow(r_\updownarrow)$, built over the molecular orbitals of the spin up and down electrons:  $\Psi(r) = J(r)\sum_n c_n D^\uparrow(r_\uparrow)D^\downarrow(r_\downarrow)$.
+The neural network used to encode the wave-function ansatz in `QMCTorch` is shown in Fig. \ref{fig:arch}. As is common in QMC simulations, the wave function is given by the product of a Jastrow factor, $J(r)$, that accounts for electronic correlations and a sum of Slater determinants, $D_n^\updownarrow(r_\updownarrow)$, built over the molecular orbitals of the spin up and down electrons:  $\Psi(r) = J(r)\sum_n c_n D_n^\uparrow(r_\uparrow)D_n^\downarrow(r_\downarrow)$.
 
 **Jastrow Factor** The `Jastrow` layer computes the sum of three components: an electron-electron term $K_{ee}$; an electron-nuclei term $K_{en}$; and a three body electron-electron-nuclei term $K_{een}$. The sum is then exponentiated to give the Jastrow factor: $J(r_{ee}, r_{en}) = \exp\left( K_{ee}(r_{ee})+K_{en}(r_{en}) + K_{een}(r_{ee},r_{en})\right)$ where $r_{ee}$ and $r_{en}$ are the electron-electron and electron-nuclei distances. Several well-known Jastrow factor functional forms, as for example the electron-electron Pade-Jastrow: $K(r_{ee}) = \frac{\omega_0 r_{ee}}{1 + \omega r_{ee}}$, where $\omega$ is a variational parameter, are already implemented and available for use. Users can also define their own functional forms for the different kernel functions, $K$, and explore their effects on the resulting optimization.  
 
-**Backflow Transformation** The backflow transformation layer, `BF`, creates quasi-particles by mixing the electronic positions of the electrons: $q_i = r_i + \sum_{i\neq j} K_{BF}(r_{ij})(r_i-r_j)$ [@backflow]. Well-known transformations such as: $K_{BF} = \frac{\omega}{r_{ij}}$ where $\omega$ is a variational parameter, are already implemented and ready to use. Users can also easily specify the kernel of the backflow transformation, $K_{BF}$ to explore its impact on the wave function optimization.
+**Backflow Transformation** The backflow transformation layer, `BF`, creates quasi-particles by mixing the positions of the electrons: $\mathbf{q}_i = \mathbf{r}_i + \sum_{j\neq i} K_{BF}(r_{ij})(\mathbf{r}_i-\mathbf{r}_j)$ [@backflow_1981; @backflow]. Well-known transformations such as $K_{BF} = \frac{\mu}{r_{ij}}$, where $\mu$ is a variational parameter, are already implemented and ready to use. Users can also easily specify the kernel of the backflow transformation, $K_{BF}$, to explore its impact on the wave function optimization.
 
 **Atomic Orbitals** The Atomic Orbital layer `AO` computes the values of the different atomic orbitals of the system at all the positions $q_e$. Both Slater type orbitals (STOs) and Gaussian type orbitals (GTOs) are supported. The initial parameters of the AOs are extracted from popular quantum chemistry codes, `pyscf` [@pyscf] and `ADF` [@adf].  During the optimization, the parameters of the AOs (exponents, coefficients) are variational parameters that can be optimized to minimize the total energy. Since GTOs can introduce a significant amount of noise in the QMC simulations, `QMCTorch` offers the possibility to fit GTOs to single exponent STOs.
 
-**Molecular Orbitals** The Molecular Orbital layer, `MO`, computes the values of all the MOs at the positions of the quasi particles. The MO layer is a simple linear transformation defined by $\textnormal{MO} =  \textnormal{AO} \times W^T_{SCF}$, where $W^T_{SCF}$ is the matrix of the MOs coefficients on the AOs. The initial values of these coefficients are obtained from a Hartree-Fock (HF) or Density Functional Theory (DFT) calculation of the system via `pyscf` or `ADF`. These coefficients are then variational parameters that can be optimized to minimize the total energy of the system. 
+**Molecular Orbitals** The Molecular Orbital layer, `MO`, computes the values of all the MOs at the positions of the quasi-particles. The MO layer is a simple linear transformation defined by $\textnormal{MO} =  \textnormal{AO} \cdot W^T_{SCF}$, where $W^T_{SCF}$ is the matrix of the MO coefficients on the AOs. The initial values of these coefficients are obtained from a Hartree-Fock (HF) or Density Functional Theory (DFT) calculation of the system via `pyscf` or `ADF`. These coefficients are then variational parameters that can be optimized to minimize the total energy of the system.
 
 **Slater Determinants** The Slater determinants layer, `SD`, extracts the spin up/down  matrices of the different electronic configurations specified by the user. Users can freely define the number of electrons as well as the number and types of excitations they want to include in the definition of their wave function ansatz. The `SD` layer will extract the corresponding matrices, multiply their determinants and sum all the terms. The `CI` coefficients of the sum can be freely initialized and optimized to minimize the total energy.
 
@@ -49,7 +49,7 @@ The Jastrow factor and the sum of Slater determinants are then multiplied to yie
 
 # Sampling, Cost Function & Optimization 
 
-QMC simulations use samples of the electronic density to approximate the total energy of the system. In `QMCTorch`, Markov-Chain Monte-Carlo (MCMC) techniques, namely Metropolis-Hasting and Hamiltonian Monte-Carlo, are used to obtained those sample. Each sample, $R_i$, contains the positions of all the electrons contained in the system. MCMC techniques require the calculation of the density for a given positions of the electrons: $\rho(R_i) = |\Psi(R_i)|^2$ that can simply obtained by squaring the result of a forward pass of the network described above.
+QMC simulations use samples of the electronic density to approximate the total energy of the system. In `QMCTorch`, Markov-Chain Monte-Carlo (MCMC) techniques, namely Metropolis-Hastings and Hamiltonian Monte-Carlo, are used to obtain those samples. Each sample, $R_i$, contains the positions of all the electrons contained in the system. MCMC techniques require the calculation of the density for given positions of the electrons: $\Pi(R_i) = |\Psi(R_i)|^2$, which can simply be obtained by squaring the result of a forward pass of the network described above.
 
 The value of local energy of the system is then computed at each sampling point and these values are summed up to compute the total energy of the system: $E = \sum_i \frac{H\Psi(R_i)}{\Psi(R_i)}$, where $H$ is the Hamiltonian of the molecular system: $H = -\frac{1}{2}\sum_i \Delta_i + V_{ee} + V_{en}$, with $\Delta_i$ the Laplacian w.r.t the i-th electron, $V_{ee}$ the coulomb potential between the electrons and $V_{en}$ the electron-nuclei potential. In `QMCTorch`, the calculation of the Laplacian of the Slater determinants can be performed using automatic differentiation but analytical expressions have also been implemented as they are computationally more robust and less expensive [@jacobi_trace]. The gradients of the total energy w.r.t the variational parameters of the wave function, i.e. $\frac{\partial E}{\partial \theta_i}$ are simply obtained via automatic differentiation. Thanks to this automatic differentiation, users can define new kernels for the backflow transformation and Jastrow factor without having to derive analytical expressions of the energy gradients.