From 2442f97c268637557d596a85df9c4c0cd0a18dc2 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Fri, 4 Oct 2024 12:59:05 -0700
Subject: [PATCH 01/18] Fix infinite recursion when builtin arg is string

---
 warp/codegen.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/warp/codegen.py b/warp/codegen.py
index df6fea737..9bb817adf 100644
--- a/warp/codegen.py
+++ b/warp/codegen.py
@@ -777,6 +777,9 @@ def func_match_args(func, arg_types, kwarg_types):
 
 
 def get_arg_type(arg: Union[Var, Any]):
+    if isinstance(arg, str):
+        return str
+
     if isinstance(arg, Sequence):
         return tuple(get_arg_type(x) for x in arg)
 

From bc41632730e9a4c40532d99521a6f71128f384cf Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Fri, 4 Oct 2024 16:41:08 -0700
Subject: [PATCH 02/18] Fix typo "the the"

---
 docs/modules/runtime.rst | 2 +-
 warp/native/bvh.cu       | 4 ++--
 warp/native/mesh.cu      | 4 ++--
 warp/types.py            | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/modules/runtime.rst b/docs/modules/runtime.rst
index bb5516422..3db80abd2 100644
--- a/docs/modules/runtime.rst
+++ b/docs/modules/runtime.rst
@@ -817,7 +817,7 @@ To record a series of kernel launches use the :func:`wp.capture_begin() <capture
         # end capture and return a graph object
         graph = wp.capture_end(device="cuda")
 
-We strongly recommend the use of the the try-finally pattern when capturing graphs because the `finally`
+We strongly recommend the use of the try-finally pattern when capturing graphs because the `finally`
 statement will ensure :func:`wp.capture_end <capture_end>` gets called, even if an exception occurs during
 capture, which would otherwise trap the stream in a capturing state.
 
diff --git a/warp/native/bvh.cu b/warp/native/bvh.cu
index b8bc69f64..6a67287b0 100644
--- a/warp/native/bvh.cu
+++ b/warp/native/bvh.cu
@@ -65,7 +65,7 @@ __global__ void bvh_refit_kernel(int n, const int* __restrict__ parents, int* __
             int finished = atomicAdd(&child_count[parent], 1);
 
             // if we have are the last thread (such that the parent node is now complete)
-            // then update its bounds and move onto the the next parent in the hierarchy
+            // then update its bounds and move onto the next parent in the hierarchy
             if (finished == 1)
             {
                 const int left_child = node_lowers[parent].i;
@@ -273,7 +273,7 @@ __global__ void build_hierarchy(int n, int* root, const int* __restrict__ deltas
             }
 
             // if we have are the last thread (such that the parent node is now complete)
-            // then update its bounds and move onto the the next parent in the hierarchy
+            // then update its bounds and move onto the next parent in the hierarchy
             if (childCount == 1)
             {
                 const int left_child = lowers[parent].i;
diff --git a/warp/native/mesh.cu b/warp/native/mesh.cu
index 4ebdf3f35..3bfac1819 100644
--- a/warp/native/mesh.cu
+++ b/warp/native/mesh.cu
@@ -101,7 +101,7 @@ __global__ void bvh_refit_with_solid_angle_kernel(int n, const int* __restrict__
             int finished = atomicAdd(&child_count[parent], 1);
 
             // if we have are the last thread (such that the parent node is now complete)
-            // then update its bounds and move onto the the next parent in the hierarchy
+            // then update its bounds and move onto the next parent in the hierarchy
             if (finished == 1)
             {
                 //printf("Compute non-leaf at %d\n", index);
@@ -340,4 +340,4 @@ void mesh_set_velocities_device(uint64_t id, wp::array_t<wp::vec3> velocities)
         fprintf(stderr, "The mesh id provided to mesh_set_velocities_device is not valid!\n");
         return;
     }
-}
\ No newline at end of file
+}
diff --git a/warp/types.py b/warp/types.py
index 28ab8301e..b5ff61c75 100644
--- a/warp/types.py
+++ b/warp/types.py
@@ -3465,7 +3465,7 @@ def get_feature_array_info(self, feature_index: int) -> Volume.FeatureArrayInfo:
         )
 
     def feature_array(self, feature_index: int, dtype=None) -> array:
-        """Returns one the the grid's feature data arrays as a Warp array
+        """Returns one of the grid's feature data arrays as a Warp array
 
         Args:
             feature_index: Index of the supplemental data array in the grid

From c592e7c78eba0e34d56e9acc3e1666ea43370564 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Fri, 4 Oct 2024 16:43:12 -0700
Subject: [PATCH 03/18] Fix iteration typo

---
 warp/sim/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warp/sim/model.py b/warp/sim/model.py
index 4d9df0fbc..6bd175bc3 100644
--- a/warp/sim/model.py
+++ b/warp/sim/model.py
@@ -641,7 +641,7 @@ class Model:
         joint_dof_count (int): Total number of velocity degrees of freedom of all joints in the system
         joint_coord_count (int): Total number of position degrees of freedom of all joints in the system
 
-        particle_coloring (list of array): The coloring of all the particles, used for VBD's Gauss-Seidel interation.
+        particle_coloring (list of array): The coloring of all the particles, used for VBD's Gauss-Seidel iteration.
 
         device (wp.Device): Device on which the Model was allocated
 

From 2604934f601601dde7002834a11914ea0fbae581 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Fri, 4 Oct 2024 16:43:41 -0700
Subject: [PATCH 04/18] Fix every time typo

---
 warp/tests/test_static.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/warp/tests/test_static.py b/warp/tests/test_static.py
index d816af4f5..9bae1e526 100644
--- a/warp/tests/test_static.py
+++ b/warp/tests/test_static.py
@@ -234,7 +234,7 @@ def function_variable_kernel(results: wp.array(dtype=int)):
             results[0] = wp.static(func)(3, 2)  # noqa: B023
 
         results = wp.zeros(1, dtype=int, device=device)
-        # note that the kernel has to be recompiled everytime the value of func changes
+        # note that the kernel has to be recompiled every time the value of func changes
         wp.launch(function_variable_kernel, 1, [results], device=device)
         assert_np_equal(results.numpy(), np.array([func(3, 2)], dtype=int))
 

From b6ccf70e56275e63e79136cd7d50eaa9b9dd9efe Mon Sep 17 00:00:00 2001
From: Christopher Crouzet <christopher@crouzet.pm>
Date: Tue, 1 Oct 2024 15:15:17 +1300
Subject: [PATCH 05/18] Fix `iter_reverse()` for ranges with steps != 1

Fixes GH-311.
---
 CHANGELOG.md        | 11 +++++++++++
 warp/native/range.h | 13 +++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ac1e54d8..570a2c50f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,16 @@
 # CHANGELOG
 
+## [Unreleased] - 2024-??
+
+### Added
+
+### Changed
+
+### Fixed
+
+- Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
+
+
 ## [1.4.0] - 2024-10-01
 
 ### Added
diff --git a/warp/native/range.h b/warp/native/range.h
index 408ad067b..5f6d05502 100644
--- a/warp/native/range.h
+++ b/warp/native/range.h
@@ -97,8 +97,17 @@ CUDA_CALLABLE inline range_t iter_reverse(const range_t& r)
 {
     // generates a reverse range, equivalent to reversed(range())
     range_t rev;
-    rev.start = r.end-1;
-    rev.end = r.start-1;
+
+    if (r.step > 0)
+    {
+        rev.start = r.start + int((r.end - r.start - 1) / r.step) * r.step;
+    }
+    else
+    {
+        rev.start = r.start + int((r.end - r.start + 1) / r.step) * r.step;
+    }
+
+    rev.end = r.start - r.step;
     rev.step = -r.step;
 
     rev.i = rev.start;

From 7a091e54255b210d816a061844e3fb3758345769 Mon Sep 17 00:00:00 2001
From: Gilles Daviet <gdaviet@nvidia.com>
Date: Mon, 7 Oct 2024 01:11:57 -0700
Subject: [PATCH 06/18] Fix edge cases in wp.sparse and eigenvector computation

---
 CHANGELOG.md               |  2 ++
 warp/examples/fem/utils.py |  3 ++-
 warp/fem/utils.py          | 13 +++++++++----
 warp/sparse.py             | 16 +++++++++-------
 warp/tests/test_fem.py     | 18 +++++++++++++++++-
 5 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 570a2c50f..2bd2baa19 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@
 
 - Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
 
+- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes
+- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/warp/examples/fem/utils.py b/warp/examples/fem/utils.py
index c7c23f8fa..65119d89d 100644
--- a/warp/examples/fem/utils.py
+++ b/warp/examples/fem/utils.py
@@ -143,7 +143,7 @@ def gen_hexmesh(res, bounds_lo: Optional[wp.vec3] = None, bounds_hi: Optional[wp
 
     x = np.linspace(bounds_lo[0], bounds_hi[0], Nx + 1)
     y = np.linspace(bounds_lo[1], bounds_hi[1], Ny + 1)
-    z = np.linspace(bounds_lo[1], bounds_hi[1], Nz + 1)
+    z = np.linspace(bounds_lo[2], bounds_hi[2], Nz + 1)
 
     positions = np.transpose(np.meshgrid(x, y, z, indexing="ij"), axes=(1, 2, 3, 0)).reshape(-1, 3)
 
@@ -252,6 +252,7 @@ def print_callback(i, err, tol):
         check_every=check_every,
         M=M,
         callback=callback,
+        use_cuda_graph=not wp.config.verify_cuda,
     )
 
     if not quiet:
diff --git a/warp/fem/utils.py b/warp/fem/utils.py
index 7e1e3f5d3..5e12d582a 100644
--- a/warp/fem/utils.py
+++ b/warp/fem/utils.py
@@ -193,9 +193,14 @@ def _givens_rotation(a: Any, b: Any):
     # Givens rotation [[c -s], [s c]] such that sa+cb =0
     zero = type(a)(0.0)
     one = type(a)(1.0)
-    abn_sq = a * a + b * b
-    abn = wp.select(abn_sq == zero, one / wp.sqrt(abn_sq), zero)
-    return a * abn, -b * abn
+
+    b2 = b * b
+    if b2 == zero:
+        # identity rotation
+        return one, zero
+
+    scale = one / wp.sqrt(a * a + b2)
+    return a * scale, -b * scale
 
 
 @wp.func
@@ -229,7 +234,7 @@ def tridiagonal_symmetric_eigenvalues_qr(D: Any, L: Any, Q: Any, tol: Any):
     x = D.dtype(0.0)  # coeff atop buldge
 
     for _ in range(32 * m):  # failsafe, usually converges faster than that
-        # Iterate over all idependant (deflated) blocks
+        # Iterate over all independent (deflated) blocks
         end = int(-1)
 
         for k in range(m - 1):
diff --git a/warp/sparse.py b/warp/sparse.py
index 37259e70a..0b86bd170 100644
--- a/warp/sparse.py
+++ b/warp/sparse.py
@@ -106,7 +106,7 @@ def _setup_nnz_transfer(self):
             return
 
         BsrMatrix.__setattr__(
-            self, "_nnz_buf", wp.zeros(dtype=int, shape=(1,), device="cpu", pinned=self.device.is_cuda)
+            self, "_nnz_buf", wp.empty(dtype=int, shape=(1,), device="cpu", pinned=self.device.is_cuda)
         )
         if self.device.is_cuda:
             BsrMatrix.__setattr__(self, "_nnz_event", wp.Event(self.device))
@@ -524,7 +524,7 @@ def _bsr_assign_split_blocks(
     if dest_block >= dest_offsets[dest_row_count]:
         return
 
-    dest_row = wp.lower_bound(dest_offsets, dest_block + 1) - 1
+    dest_row = wp.lower_bound(dest_offsets, 0, dest_row_count + 1, dest_block + 1) - 1
     src_row = dest_row // row_factor
 
     dest_col_in_row = dest_block - dest_offsets[dest_row]
@@ -566,7 +566,7 @@ def _bsr_assign_merge_row_col(
         dest_rows[block] = -1  # invalid
         dest_cols[block] = -1
     else:
-        row = wp.lower_bound(src_offsets, block + 1) - 1
+        row = wp.lower_bound(src_offsets, 0, src_row_count + 1, block + 1) - 1
         dest_rows[block] = row // row_factor
         dest_cols[block] = src_columns[block] // col_factor
 
@@ -589,7 +589,7 @@ def _bsr_assign_merge_blocks(
     if src_block >= src_offsets[src_row_count]:
         return
 
-    src_row = wp.lower_bound(src_offsets, src_block + 1) - 1
+    src_row = wp.lower_bound(src_offsets, 0, src_row_count + 1, src_block + 1) - 1
     src_col = src_columns[src_block]
 
     dest_row = src_row // row_factor
@@ -828,7 +828,7 @@ def bsr_copy(
         block_type=block_type,
         device=A.device,
     )
-    bsr_assign(dest=copy, src=A)
+    bsr_assign(dest=copy, src=A, structure_only=structure_only)
     return copy
 
 
@@ -1190,7 +1190,7 @@ def _bsr_get_block_row(dest_offset: int, row_count: int, bsr_offsets: wp.array(d
     if i >= bsr_offsets[row_count]:
         rows[dest_offset + i] = -1  # invalid
     else:
-        row = wp.lower_bound(bsr_offsets, i + 1) - 1
+        row = wp.lower_bound(bsr_offsets, 0, row_count + 1, i + 1) - 1
         rows[dest_offset + i] = row
 
 
@@ -1461,13 +1461,14 @@ def _bsr_mm_compute_values(
     y_offsets: wp.array(dtype=int),
     y_columns: wp.array(dtype=int),
     y_values: wp.array(dtype=Any),
+    mm_row_count: int,
     mm_offsets: wp.array(dtype=int),
     mm_cols: wp.array(dtype=int),
     mm_values: wp.array(dtype=Any),
 ):
     mm_block = wp.tid()
 
-    row = wp.lower_bound(mm_offsets, mm_block + 1) - 1
+    row = wp.lower_bound(mm_offsets, 0, mm_row_count + 1, mm_block + 1) - 1
     col = mm_cols[mm_block]
 
     mm_val = mm_values.dtype(type(alpha)(0.0))
@@ -1759,6 +1760,7 @@ def bsr_mm(
             work_arrays._old_z_offsets if y == z else y.offsets,
             work_arrays._old_z_columns if y == z else y.columns,
             work_arrays._old_z_values if y == z else y.values,
+            z.nrow,
             z.offsets,
             z.columns,
             mm_values,
diff --git a/warp/tests/test_fem.py b/warp/tests/test_fem.py
index 63e3cde91..e8e96ecea 100644
--- a/warp/tests/test_fem.py
+++ b/warp/tests/test_fem.py
@@ -28,6 +28,9 @@
 )
 from warp.tests.unittest_utils import *
 
+vec6f = wp.vec(length=6, dtype=float)
+mat66f = wp.mat(shape=(6, 6), dtype=float)
+
 
 @integrand
 def linear_form(s: Sample, u: Field):
@@ -1507,7 +1510,7 @@ def test_implicit_fields(test, device):
 
 @wp.kernel
 def test_qr_eigenvalues():
-    tol = 1.0e-6
+    tol = 1.0e-8
 
     # zero
     Zero = wp.mat33(0.0)
@@ -1546,6 +1549,19 @@ def test_qr_eigenvalues():
     Err4 = wp.transpose(P4) * wp.diag(D4) * P4 - Rank4
     wp.expect_near(wp.ddot(Err4, Err4), 0.0, tol)
 
+    # test robustness to low requested tolerance
+    Rank6 = mat66f(
+        vec6f(0.00171076, 0.0, 0.0, 0.0, 0.0, 0.0),
+        vec6f(0.0, 0.00169935, 6.14367e-06, -3.52589e-05, 3.02397e-05, -1.53458e-11),
+        vec6f(0.0, 6.14368e-06, 0.00172217, 2.03568e-05, 1.74589e-05, -2.92627e-05),
+        vec6f(0.0, -3.52589e-05, 2.03568e-05, 0.00172178, 2.53422e-05, 3.02397e-05),
+        vec6f(0.0, 3.02397e-05, 1.74589e-05, 2.53422e-05, 0.00171114, 3.52589e-05),
+        vec6f(0.0, 6.42993e-12, -2.92627e-05, 3.02397e-05, 3.52589e-05, 0.00169935),
+    )
+    D6, P6 = symmetric_eigenvalues_qr(Rank6, 0.0)
+    Err6 = wp.transpose(P6) * wp.diag(D6) * P6 - Rank6
+    wp.expect_near(wp.ddot(Err6, Err6), 0.0, 1.0e-13)
+
 
 @wp.kernel
 def test_qr_inverse():

From 7483cbfd20235cc98458f4a8cdfa5ef4b5a3acfa Mon Sep 17 00:00:00 2001
From: Eric Heiden <eheiden@nvidia.com>
Date: Mon, 7 Oct 2024 17:46:47 -0700
Subject: [PATCH 07/18] Fix codegen error when nesting dynamic and static
 for-loops

---
 CHANGELOG.md               |  1 +
 warp/codegen.py            | 25 +++++++++++++++++--------
 warp/tests/test_codegen.py | 31 +++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2bd2baa19..6da251b54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
 
 - Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes
 - Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`
+- Fix invalid code generation error messages when nesting dynamic and static for-loops
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/warp/codegen.py b/warp/codegen.py
index 9bb817adf..7db6ecc5a 100644
--- a/warp/codegen.py
+++ b/warp/codegen.py
@@ -939,7 +939,7 @@ def build(adj, builder, default_builder_options=None):
 
         adj.return_var = None  # return type for function or kernel
         adj.loop_symbols = []  # symbols at the start of each loop
-        adj.loop_const_iter_symbols = set()  # iteration variables (constant) for static loops
+        adj.loop_const_iter_symbols = []  # iteration variables (constant) for static loops
 
         # blocks
         adj.blocks = [Block()]
@@ -1849,7 +1849,7 @@ def materialize_redefinitions(adj, symbols):
         # detect symbols with conflicting definitions (assigned inside the for loop)
         for items in symbols.items():
             sym = items[0]
-            if adj.loop_const_iter_symbols is not None and sym in adj.loop_const_iter_symbols:
+            if adj.is_constant_iter_symbol(sym):
                 # ignore constant overwriting in for-loops if it is a loop iterator
                 # (it is no problem to unroll static loops multiple times in sequence)
                 continue
@@ -2001,11 +2001,21 @@ def get_unroll_range(adj, loop):
         return range_call
 
     def begin_record_constant_iter_symbols(adj):
-        if adj.loop_const_iter_symbols is None:
-            adj.loop_const_iter_symbols = set()
+        if len(adj.loop_const_iter_symbols) > 0:
+            adj.loop_const_iter_symbols.append(adj.loop_const_iter_symbols[-1])
+        else:
+            adj.loop_const_iter_symbols.append(set())
 
     def end_record_constant_iter_symbols(adj):
-        adj.loop_const_iter_symbols = None
+        if len(adj.loop_const_iter_symbols) > 0:
+            adj.loop_const_iter_symbols.pop()
+
+    def record_constant_iter_symbol(adj, sym):
+        if len(adj.loop_const_iter_symbols) > 0:
+            adj.loop_const_iter_symbols[-1].add(sym)
+
+    def is_constant_iter_symbol(adj, sym):
+        return len(adj.loop_const_iter_symbols) > 0 and sym in adj.loop_const_iter_symbols[-1]
 
     def emit_For(adj, node):
         # try and unroll simple range() statements that use constant args
@@ -2013,9 +2023,8 @@ def emit_For(adj, node):
 
         if isinstance(unroll_range, range):
             const_iter_sym = node.target.id
-            if adj.loop_const_iter_symbols is not None:
-                # prevent constant conflicts in `materialize_redefinitions()`
-                adj.loop_const_iter_symbols.add(const_iter_sym)
+            # prevent constant conflicts in `materialize_redefinitions()`
+            adj.record_constant_iter_symbol(const_iter_sym)
 
             # unroll static for-loop
             for i in unroll_range:
diff --git a/warp/tests/test_codegen.py b/warp/tests/test_codegen.py
index e3552ad24..6beb4d542 100644
--- a/warp/tests/test_codegen.py
+++ b/warp/tests/test_codegen.py
@@ -503,6 +503,37 @@ def dynamic_loop_kernel(n: int, input: wp.array(dtype=float)):
     ):
         wp.launch(dynamic_loop_kernel, dim=1, inputs=[3, inputs], device=device)
 
+    # the following nested loop must not raise an error
+    const_a = 7
+    const_b = 5
+
+    @wp.kernel
+    def mixed_dyn_static_loop_kernel(dyn_a: int, dyn_b: int, dyn_c: int, output: wp.array(dtype=float, ndim=2)):
+        tid = wp.tid()
+        for i in range(const_a + 1):
+            for j in range(dyn_a + 1):
+                for k in range(dyn_b + 1):
+                    for l in range(const_b + 1):
+                        for m in range(dyn_c + 1):
+                            coeff = i + j + k + l + m
+                            output[tid, coeff] = 1.0
+
+    dyn_a, dyn_b, dyn_c = 3, 4, 5
+    num_threads = 10
+    output = wp.empty([num_threads, const_a + const_b + dyn_a + dyn_b + dyn_c + 1], dtype=float, device=device)
+    wp.launch(
+        mixed_dyn_static_loop_kernel,
+        num_threads,
+        inputs=[
+            dyn_a,
+            dyn_b,
+            dyn_c,
+        ],
+        outputs=[output],
+        device=device,
+    )
+    assert_np_equal(output.numpy(), np.ones([num_threads, const_a + const_b + dyn_a + dyn_b + dyn_c + 1]))
+
 
 @wp.kernel
 def test_call_syntax():

From ebfaa52596c9476deddbd6040fada31825bc22fb Mon Sep 17 00:00:00 2001
From: Eric Heiden <eheiden@nvidia.com>
Date: Mon, 7 Oct 2024 19:10:08 -0700
Subject: [PATCH 08/18] Improve error reporting in array.grad setter

---
 warp/types.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/warp/types.py b/warp/types.py
index b5ff61c75..e50b4cfd0 100644
--- a/warp/types.py
+++ b/warp/types.py
@@ -2261,13 +2261,22 @@ def grad(self, grad):
             self._requires_grad = False
         else:
             # make sure the given gradient array is compatible
-            if (
-                grad.dtype != self.dtype
-                or grad.shape != self.shape
-                or grad.strides != self.strides
-                or grad.device != self.device
-            ):
-                raise ValueError("The given gradient array is incompatible")
+            if grad.dtype != self.dtype:
+                raise ValueError(
+                    f"The given gradient array is incompatible: expected dtype {self.dtype}, got {grad.dtype}"
+                )
+            if grad.shape != self.shape:
+                raise ValueError(
+                    f"The given gradient array is incompatible: expected shape {self.shape}, got {grad.shape}"
+                )
+            if grad.device != self.device:
+                raise ValueError(
+                    f"The given gradient array is incompatible: expected device {self.device}, got {grad.device}"
+                )
+            if grad.strides != self.strides:
+                raise ValueError(
+                    f"The given gradient array is incompatible: expected strides {self.strides}, got {grad.strides}"
+                )
             self._grad = grad
             self._requires_grad = True
 

From cfe0437352efa25898d966b261ef602261be4754 Mon Sep 17 00:00:00 2001
From: Lukasz Wawrzyniak <lwawrzyniak@nvidia.com>
Date: Wed, 9 Oct 2024 17:59:01 -0400
Subject: [PATCH 09/18] Fix hashing of static expressions

---
 CHANGELOG.md              |   1 +
 warp/context.py           |  48 ++++++------
 warp/tests/test_static.py | 156 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 182 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6da251b54..79a870f29 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
 - Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes
 - Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`
 - Fix invalid code generation error messages when nesting dynamic and static for-loops
+- Fix caching of kernels with static expressions
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/warp/context.py b/warp/context.py
index 736f8c852..91fe2889f 100644
--- a/warp/context.py
+++ b/warp/context.py
@@ -6,7 +6,6 @@
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 import ast
-import builtins
 import ctypes
 import functools
 import hashlib
@@ -22,7 +21,6 @@
 import weakref
 from copy import copy as shallowcopy
 from pathlib import Path
-from struct import pack as struct_pack
 from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
 
 import numpy as np
@@ -1487,30 +1485,16 @@ def hash_adjoint(self, adj):
         # hash referenced constants
         for name, value in constants.items():
             ch.update(bytes(name, "utf-8"))
-            # hash the referenced object
-            if isinstance(value, builtins.bool):
-                # This needs to come before the check for `int` since all boolean
-                # values are also instances of `int`.
-                ch.update(struct_pack("?", value))
-            elif isinstance(value, int):
-                ch.update(struct_pack("<q", value))
-            elif isinstance(value, float):
-                ch.update(struct_pack("<d", value))
-            elif isinstance(value, warp.types.float16):
-                # float16 is a special case
-                p = ctypes.pointer(ctypes.c_float(value.value))
-                ch.update(p.contents)
-            elif isinstance(value, tuple(warp.types.scalar_types)):
-                p = ctypes.pointer(value._type_(value.value))
-                ch.update(p.contents)
-            elif isinstance(value, ctypes.Array):
-                ch.update(bytes(value))
-            else:
-                raise RuntimeError(f"Invalid constant type: {type(value)}")
+            ch.update(self.get_constant_bytes(value))
 
         # hash wp.static() expressions that were evaluated at declaration time
         for k, v in adj.static_expressions.items():
-            ch.update(bytes(f"{k} = {v}", "utf-8"))
+            ch.update(bytes(k, "utf-8"))
+            if isinstance(v, Function):
+                if v not in self.functions_in_progress:
+                    ch.update(self.hash_function(v))
+            else:
+                ch.update(self.get_constant_bytes(v))
 
         # hash referenced types
         for t in types.keys():
@@ -1523,6 +1507,24 @@ def hash_adjoint(self, adj):
 
         return ch.digest()
 
+    def get_constant_bytes(self, value):
+        if isinstance(value, int):
+            # this also handles builtins.bool
+            return bytes(ctypes.c_int(value))
+        elif isinstance(value, float):
+            return bytes(ctypes.c_float(value))
+        elif isinstance(value, warp.types.float16):
+            # float16 is a special case
+            return bytes(ctypes.c_float(value.value))
+        elif isinstance(value, tuple(warp.types.scalar_and_bool_types)):
+            return bytes(value._type_(value.value))
+        elif hasattr(value, "_wp_scalar_type_"):
+            return bytes(value)
+        elif isinstance(value, warp.codegen.StructInstance):
+            return bytes(value._ctype)
+        else:
+            raise TypeError(f"Invalid constant type: {type(value)}")
+
     def get_module_hash(self):
         return self.module_hash
 
diff --git a/warp/tests/test_static.py b/warp/tests/test_static.py
index 9bae1e526..9e3f73931 100644
--- a/warp/tests/test_static.py
+++ b/warp/tests/test_static.py
@@ -5,6 +5,8 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+import importlib
+import tempfile
 import unittest
 from typing import Dict, List
 
@@ -17,6 +19,23 @@
 global_variable = 3
 
 
+def load_code_as_module(code, name):
+    file, file_path = tempfile.mkstemp(suffix=".py")
+
+    try:
+        with os.fdopen(file, "w") as f:
+            f.write(code)
+
+        spec = importlib.util.spec_from_file_location(name, file_path)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+    finally:
+        os.remove(file_path)
+
+    # return Warp module
+    return wp.get_module(module.__name__)
+
+
 @wp.func
 def static_global_variable_func():
     static_var = warp.static(global_variable + 2)
@@ -383,6 +402,140 @@ def static_condition3(results: wp.array(dtype=int)):
         assert_np_equal(counts["else"], 0)
 
 
+static_builtin_constant_template = """
+import warp as wp
+
+# Python builtin literal like 17, 42.0, or True
+C = {value}
+
+@wp.kernel
+def k():
+    print(wp.static(C))
+"""
+
+static_warp_constant_template = """
+import warp as wp
+
+# Warp scalar value like wp.uint8(17)
+C = wp.{dtype}({value})
+
+@wp.kernel
+def k():
+    print(wp.static(C))
+"""
+
+static_struct_constant_template = """
+import warp as wp
+
+@wp.struct
+class SimpleStruct:
+    x: float
+
+C = SimpleStruct()
+C.x = {value}
+
+@wp.kernel
+def k():
+    print(wp.static(C))
+"""
+
+static_func_template = """
+import warp as wp
+
+@wp.func
+def f():
+    # modify the function to verify hashing
+    return {value}
+
+@wp.kernel
+def k():
+    print(wp.static(f)())
+"""
+
+
+def test_static_constant_hash(test, _):
+    # Python literals
+    # (type, value1, value2)
+    literals = [
+        (int, 17, 42),
+        (float, 17.5, 42.5),
+        (bool, True, False),
+    ]
+
+    for builtin_type, value1, value2 in literals:
+        type_name = builtin_type.__name__
+        with test.subTest(msg=f"{type_name}"):
+            source1 = static_builtin_constant_template.format(value=value1)
+            source2 = static_builtin_constant_template.format(value=value2)
+            source3 = static_builtin_constant_template.format(value=value1)
+
+            module1 = load_code_as_module(source1, f"aux_static_constant_builtin_{type_name}_1")
+            module2 = load_code_as_module(source2, f"aux_static_constant_builtin_{type_name}_2")
+            module3 = load_code_as_module(source3, f"aux_static_constant_builtin_{type_name}_3")
+
+            hash1 = module1.hash_module()
+            hash2 = module2.hash_module()
+            hash3 = module3.hash_module()
+
+            test.assertNotEqual(hash1, hash2)
+            test.assertEqual(hash1, hash3)
+
+    # Warp types (scalars, vectors, matrices)
+    for warp_type in [*wp.types.scalar_types, *wp.types.vector_types]:
+        type_name = warp_type.__name__
+        with test.subTest(msg=f"wp.{type_name}"):
+            value1 = ", ".join([str(17)] * warp_type._length_)
+            value2 = ", ".join([str(42)] * warp_type._length_)
+            source1 = static_warp_constant_template.format(dtype=type_name, value=value1)
+            source2 = static_warp_constant_template.format(dtype=type_name, value=value2)
+            source3 = static_warp_constant_template.format(dtype=type_name, value=value1)
+
+            module1 = load_code_as_module(source1, f"aux_static_constant_wp_{type_name}_1")
+            module2 = load_code_as_module(source2, f"aux_static_constant_wp_{type_name}_2")
+            module3 = load_code_as_module(source3, f"aux_static_constant_wp_{type_name}_3")
+
+            hash1 = module1.hash_module()
+            hash2 = module2.hash_module()
+            hash3 = module3.hash_module()
+
+            test.assertNotEqual(hash1, hash2)
+            test.assertEqual(hash1, hash3)
+
+    # structs
+    with test.subTest(msg="struct"):
+        source1 = static_struct_constant_template.format(value=17)
+        source2 = static_struct_constant_template.format(value=42)
+        source3 = static_struct_constant_template.format(value=17)
+
+        module1 = load_code_as_module(source1, "aux_static_constant_struct_1")
+        module2 = load_code_as_module(source2, "aux_static_constant_struct_2")
+        module3 = load_code_as_module(source3, "aux_static_constant_struct_3")
+
+        hash1 = module1.hash_module()
+        hash2 = module2.hash_module()
+        hash3 = module3.hash_module()
+
+        test.assertNotEqual(hash1, hash2)
+        test.assertEqual(hash1, hash3)
+
+
+def test_static_function_hash(test, _):
+    source1 = static_func_template.format(value=17)
+    source2 = static_func_template.format(value=42)
+    source3 = static_func_template.format(value=17)
+
+    module1 = load_code_as_module(source1, "aux_static_func1")
+    module2 = load_code_as_module(source2, "aux_static_func2")
+    module3 = load_code_as_module(source3, "aux_static_func3")
+
+    hash1 = module1.hash_module()
+    hash2 = module2.hash_module()
+    hash3 = module3.hash_module()
+
+    test.assertNotEqual(hash1, hash2)
+    test.assertEqual(hash1, hash3)
+
+
 devices = get_test_devices()
 
 
@@ -406,6 +559,9 @@ def test_static_python_call(self):
 add_function_test(TestStatic, "test_static_for_loop", test_static_for_loop, devices=devices)
 add_function_test(TestStatic, "test_static_if_else_elif", test_static_if_else_elif, devices=devices)
 
+add_function_test(TestStatic, "test_static_constant_hash", test_static_constant_hash, devices=None)
+add_function_test(TestStatic, "test_static_function_hash", test_static_function_hash, devices=None)
+
 
 if __name__ == "__main__":
     wp.clear_kernel_cache()

From 286f380830cbfe83577a1d1828dbc17dac7a8f82 Mon Sep 17 00:00:00 2001
From: Kristijan Bartol <kristijan.bartol@gmail.com>
Date: Wed, 9 Oct 2024 13:57:30 +0200
Subject: [PATCH 10/18] Remove redundant if block in the XPBD simulator.

---
 warp/sim/integrator_xpbd.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/warp/sim/integrator_xpbd.py b/warp/sim/integrator_xpbd.py
index ef585c296..d8d1d8548 100644
--- a/warp/sim/integrator_xpbd.py
+++ b/warp/sim/integrator_xpbd.py
@@ -2808,12 +2808,8 @@ def simulate(self, model: Model, state_in: State, state_out: State, dt: float, c
 
         with wp.ScopedTimer("simulate", False):
             if model.particle_count:
-                if requires_grad:
-                    particle_q = state_out.particle_q
-                    particle_qd = state_out.particle_qd
-                else:
-                    particle_q = state_out.particle_q
-                    particle_qd = state_out.particle_qd
+                particle_q = state_out.particle_q
+                particle_qd = state_out.particle_qd
 
                 self.particle_q_init = wp.clone(state_in.particle_q)
                 if self.enable_restitution:

From 50fca0bfd2ddb8109e674c98ddff5b4cb4cd059b Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <mehdi.ataei@autodesk.com>
Date: Thu, 10 Oct 2024 12:42:33 -0400
Subject: [PATCH 11/18] Added advanced optimization example using static loop
 unrolling

---
 docs/codegen.rst | 148 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 147 insertions(+), 1 deletion(-)

diff --git a/docs/codegen.rst b/docs/codegen.rst
index fe5ed81bb..b4984781c 100644
--- a/docs/codegen.rst
+++ b/docs/codegen.rst
@@ -446,6 +446,153 @@ The above program uses a static expression to select the right function given th
     [2. 0.]
 
 
+Advanced Example: Branching Elimination with Static Loop Unrolling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In computational simulations, it's common to apply different operations or boundary conditions based on runtime variables. However, conditional branching using runtime variables often leads to performance issues due to register pressure, as the GPU may allocate resources for all branches even if some of them are never taken. To tackle this, we can utilize static loop unrolling via ``wp.static(...)``, which helps eliminate unnecessary branching at compile time and improve parallel execution.
+
+**Scenario:**
+
+Suppose we have three different functions ``apply_func_a``, ``apply_func_b``, and ``apply_func_c`` that perform different mathematical operations.
+
+We are currently interested in applying only two of these functions (``apply_func_a`` and ``apply_func_b``) on a given dataset. Which function we apply to each data point is determined by a runtime variable ``func_id``, which is read from an array called ``func_field`` that is passed to the kernel.
+
+In practice, ``func_field`` represents a mapping of which operation should be applied to each data point, and is particularly useful when dealing with boundary conditions or different regions of a physical simulation. For example, in a fluid simulation, different regions of the fluid might require different updates based on pre-defined boundary conditions.
+
+**Naive Approach Implementation**
+
+To start, let us first consider a naive approach to implement this, which involves straightforward runtime branching based on the value of ``func_id``. This approach will highlight why we need to optimize further.
+
+.. code:: python
+
+    import warp as wp
+    import numpy as np
+
+    # Define three functions that perform different operations
+    @wp.func
+    def apply_func_a(x: float) -> float:
+        return x + 10.0
+
+    @wp.func
+    def apply_func_b(x: float) -> float:
+        return x * 2.0
+
+    @wp.func
+    def apply_func_c(x: float) -> float:
+        return x - 5.0
+
+    # Assign static IDs to represent each function
+    func_id_a = 0
+    func_id_b = 1
+    func_id_c = 2  # Not used in this kernel
+
+    # Kernel that applies the correct function to each element of the input array
+    @wp.kernel
+    def apply_func_conditions_naive(x: wp.array(dtype=wp.float32), func_field: wp.array(dtype=wp.int8)):
+        tid = wp.tid()
+        value = x[tid]
+        result = value
+        func_id = func_field[tid]  # Get the function ID for this element
+
+        # Apply the corresponding function based on func_id
+        if func_id == func_id_a:
+            result = apply_func_a(value)
+        elif func_id == func_id_b:
+            result = apply_func_b(value)
+        elif func_id == func_id_c:
+            result = apply_func_c(value)
+
+        x[tid] = result
+
+    # Example usage
+    data = wp.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=wp.float32)
+
+    # Create an array that specifies which function to apply to each element
+    func_field = wp.array([func_id_a, func_id_b, func_id_b, func_id_a, func_id_b], dtype=wp.int8)
+
+    # Launch the kernel
+    wp.launch(apply_func_conditions_naive, inputs=[data, func_field], dim=data.size)
+
+    print(data.numpy())
+
+**Output:**
+
+.. code:: python
+
+    [11.  4.  6. 14. 10.]
+
+Since ``func_id`` is not static, the compiler cannot eliminate the unused function at compile time. Looking at the generated CUDA code, we can see the kernel includes an extra branch for the unused ``apply_func_c``:
+
+.. code:: cpp
+
+    //...
+    var_11 = wp::select(var_9, var_4, var_10);
+    if (!var_9) {
+        var_13 = (var_7 == var_12);
+        if (var_13) {
+            var_14 = apply_func_b_0(var_3);
+        }
+        var_15 = wp::select(var_13, var_11, var_14);
+        if (!var_13) {
+            var_17 = (var_7 == var_16);
+            if (var_17) {
+                var_18 = apply_func_c_0(var_3);
+            }
+            var_19 = wp::select(var_17, var_15, var_18);
+        }
+        var_20 = wp::select(var_13, var_19, var_15);
+    }
+    //...
+
+**Optimization**
+
+To avoid the extra branching, we can use static loop unrolling via ``wp.static(...)`` to effectively "compile out" the unnecessary branches and only keep the operations that are relevant.
+
+**Implementation:**
+
+.. code:: python
+
+    funcs = [apply_func_a, apply_func_b, apply_func_c]
+
+    # Assign static IDs to represent each function
+    func_id_a = 0
+    func_id_b = 1
+    func_id_c = 2  # Not used in this kernel
+
+    # Define which function IDs are actually used in this kernel
+    used_func_ids = (func_id_a, func_id_b)
+
+    @wp.kernel
+    def apply_func_conditions(x: wp.array(dtype=wp.float32), func_field: wp.array(dtype=wp.int8)):
+        tid = wp.tid()
+        value = x[tid]
+        result = value
+        func_id = func_field[tid]  # Get the function ID for this element
+
+        # Unroll the loop over the used function IDs
+        for i in range(wp.static(len(used_func_ids))):
+            func_static_id = wp.static(used_func_ids[i])
+            if func_id == func_static_id:
+                result = wp.static(funcs[used_func_ids[i]])(value)
+
+        x[tid] = result
+
+
+In the generated CUDA code, we can see that the optimized code does not branch for the unused function.
+
+.. code:: cpp
+    
+    //...
+    var_10 = (var_7 == var_9);
+    if (var_10) {
+        var_11 = apply_func_a_1(var_3);
+    }
+    var_12 = wp::select(var_10, var_4, var_11);
+    var_15 = (var_7 == var_14);
+    if (var_15) {
+        var_16 = apply_func_b_1(var_3);
+    }
+    //...
+
 .. _dynamic_generation:
 
 Dynamic Kernel Creation
@@ -566,7 +713,6 @@ Output:
     [ 1.  4.  9.  16.  25.]
     [ 1.  8.  27.  64.  125.]
 
-
 Function Closures
 ~~~~~~~~~~~~~~~~~
 

From 4a7024f303bafe031ea6023a4d02d85ba346b8f8 Mon Sep 17 00:00:00 2001
From: Eric Heiden <eheiden@nvidia.com>
Date: Thu, 10 Oct 2024 20:02:51 -0700
Subject: [PATCH 12/18] Fix ModelBuilder.add_builder for multiple articulations

---
 CHANGELOG.md             |  1 +
 warp/sim/model.py        |  3 +--
 warp/tests/test_model.py | 13 +++++++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 79a870f29..634cadfae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@
 - Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`
 - Fix invalid code generation error messages when nesting dynamic and static for-loops
 - Fix caching of kernels with static expressions
+- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/warp/sim/model.py b/warp/sim/model.py
index 6bd175bc3..98a055dd5 100644
--- a/warp/sim/model.py
+++ b/warp/sim/model.py
@@ -1404,9 +1404,8 @@ def add_builder(self, builder, xform=None, update_num_env_count=True, separate_c
             self.joint_X_p.extend(joint_X_p)
             self.joint_q.extend(joint_q)
 
-            self.add_articulation()
-
             # offset the indices
+            self.articulation_start.extend([a + self.joint_count for a in builder.articulation_start])
             self.joint_parent.extend([p + self.joint_count if p != -1 else -1 for p in builder.joint_parent])
             self.joint_child.extend([c + self.joint_count for c in builder.joint_child])
 
diff --git a/warp/tests/test_model.py b/warp/tests/test_model.py
index dde81889b..da872a6cb 100644
--- a/warp/tests/test_model.py
+++ b/warp/tests/test_model.py
@@ -157,6 +157,19 @@ def add_three_cubes(builder: ModelBuilder, parent_body=-1):
         assert builder.body_mass == [1.0, 4.0]
         assert builder.body_inv_mass == [1.0, 0.25]
 
+        # create another builder, test add_builder function
+        builder2 = ModelBuilder()
+        builder2.add_builder(builder)
+        assert builder2.articulation_count == builder.articulation_count
+        assert builder2.joint_count == builder.joint_count
+        assert builder2.body_count == builder.body_count
+        assert builder2.shape_count == builder.shape_count
+        assert builder2.articulation_start == builder.articulation_start
+        # add the same builder again
+        builder2.add_builder(builder)
+        assert builder2.articulation_count == 2 * builder.articulation_count
+        assert builder2.articulation_start == [0, 1, 2, 3]
+
 
 if __name__ == "__main__":
     wp.clear_kernel_cache()

From 47eac851dba9c7a105b3b5a0f87111a9b9d3f85d Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Mon, 14 Oct 2024 09:52:30 -0700
Subject: [PATCH 13/18] Use NumPy 2.0 on Windows for CI

---
 .gitlab-ci.yml                        | 8 ++------
 .gitlab/ci/additional-tests.yml       | 6 +-----
 .gitlab/ci/cuda-11-build-and-test.yml | 8 ++------
 .gitlab/ci/debug-build-and-test.yml   | 6 +-----
 4 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ff01e7049..04e391b79 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -65,7 +65,7 @@ linux-aarch64 build:
     - apt-get update && apt-get install build-essential curl --no-install-recommends -y
     - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K"
   script:
-    - ./tools/ci/building/build-linux-x86_64/build.sh --no-docker # We are already using the builder image
+    - ./tools/ci/building/build-linux-aarch64/build.sh --no-docker # We are already using the builder image
     - mkdir -p warp/bin/linux-aarch64
     - mv warp/bin/warp.so warp/bin/linux-aarch64
     - mv warp/bin/warp-clang.so warp/bin/linux-aarch64
@@ -338,11 +338,7 @@ windows-x86_64 test mgpu:
     - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name
     - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv'
     - .\_venv\Scripts\Activate.ps1
-    - python -m pip install --upgrade pip
-    - python -m pip install --upgrade usd-core
-    # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible
-    # https://github.com/pytorch/pytorch/issues/128860
-    - python -m pip install "numpy<2"
+    - python -m pip install --upgrade pip usd-core numpy
     - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121
     - python -m pip install -e .
     - Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K"
diff --git a/.gitlab/ci/additional-tests.yml b/.gitlab/ci/additional-tests.yml
index 10c198899..6b6f92028 100644
--- a/.gitlab/ci/additional-tests.yml
+++ b/.gitlab/ci/additional-tests.yml
@@ -67,11 +67,7 @@ windows-x86_64 test:
     - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name
     - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv'
     - .\_venv\Scripts\Activate.ps1
-    - python -m pip install --upgrade pip
-    - python -m pip install --upgrade usd-core
-    # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible
-    # https://github.com/pytorch/pytorch/issues/128860
-    - python -m pip install "numpy<2"
+    - python -m pip install --upgrade pip usd-core numpy
     - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121
     - python -m pip install -e .
     - Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K"
diff --git a/.gitlab/ci/cuda-11-build-and-test.yml b/.gitlab/ci/cuda-11-build-and-test.yml
index 507cbbfd5..675b245e8 100644
--- a/.gitlab/ci/cuda-11-build-and-test.yml
+++ b/.gitlab/ci/cuda-11-build-and-test.yml
@@ -53,7 +53,7 @@ linux-aarch64 build:
     - apt-get update && apt-get install build-essential curl --no-install-recommends -y
     - echo -e "\\e[0Ksection_end:`date +%s`:install_dependencies\\r\\e[0K"
   script:
-    - ./tools/ci/building/build-linux-x86_64/build.sh --cuda 11 --no-docker # We are already using the builder image
+    - ./tools/ci/building/build-linux-aarch64/build.sh --cuda 11 --no-docker # We are already using the builder image
     - mkdir -p warp/bin/linux-aarch64
     - mv warp/bin/warp.so warp/bin/linux-aarch64
     - mv warp/bin/warp-clang.so warp/bin/linux-aarch64
@@ -145,11 +145,7 @@ windows-x86_64 test:
     - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name
     - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv'
     - .\_venv\Scripts\Activate.ps1
-    - python -m pip install --upgrade pip
-    - python -m pip install --upgrade usd-core
-    # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible
-    # https://github.com/pytorch/pytorch/issues/128860
-    - python -m pip install "numpy<2"
+    - python -m pip install --upgrade pip usd-core numpy
     - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121
     - python -m pip install -e .
     - Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K"
diff --git a/.gitlab/ci/debug-build-and-test.yml b/.gitlab/ci/debug-build-and-test.yml
index e86f553d1..b468aa079 100644
--- a/.gitlab/ci/debug-build-and-test.yml
+++ b/.gitlab/ci/debug-build-and-test.yml
@@ -136,11 +136,7 @@ windows-x86_64 test:
     - tools/packman/packman.cmd install -l _build/target-deps/python python $python_name
     - '& $env:CI_PROJECT_DIR\_build\target-deps\python\python.exe -m venv _venv'
     - .\_venv\Scripts\Activate.ps1
-    - python -m pip install --upgrade pip
-    - python -m pip install --upgrade usd-core
-    # Temporary HACK: use NumPy < 2.0 on Windows due to issues with Torch wheels that are not compatible
-    # https://github.com/pytorch/pytorch/issues/128860
-    - python -m pip install "numpy<2"
+    - python -m pip install --upgrade pip usd-core numpy
     - python -m pip install --upgrade torch --extra-index-url https://download.pytorch.org/whl/cu121
     - python -m pip install -e .
     - Write-Output "$([char]27)[0Ksection_end:$(GetTime):install_dependencies$([char]13)$([char]27)[0K"

From c17409142eb9a58b1ea80cce5a9efd6a8d1d6688 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Mon, 14 Oct 2024 12:00:11 -0700
Subject: [PATCH 14/18] Add changelog page to the Sphinx docs

---
 CHANGELOG.md                          | 110 ++++++++++++-------------
 docs/changelog.md                     |   8 ++
 docs/conf.py                          |   6 ++
 docs/index.rst                        |   1 +
 docs/requirements.txt                 |   1 +
 exts/omni.warp.core/docs/CHANGELOG.md | 113 +++++++++++++-------------
 exts/omni.warp/docs/CHANGELOG.md      | 113 +++++++++++++-------------
 7 files changed, 183 insertions(+), 169 deletions(-)
 create mode 100644 docs/changelog.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 634cadfae..f4ad172cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-# CHANGELOG
+# Changelog
 
 ## [Unreleased] - 2024-??
 
@@ -220,7 +220,7 @@
 - Fix for handling of `bool` types in generic kernels
 - Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details
 
-## [1.1.1] - 2024-05-24
+## 1.1.1 - 2024-05-24
 
 - `wp.init()` is no longer required to be called explicitly and will be performed on first call to the API
 - Speed up `omni.warp.core`'s startup time
@@ -255,7 +255,7 @@
 - Support gradient propagation for indexing sliced multi-dimensional arrays, i.e. `a[i][j]` vs. `a[i, j]`
 - Provide an informative message if setting DLL C-types failed, instructing to try rebuilding the library
 
-## [1.0.3] - 2024-04-17
+## 1.0.3 - 2024-04-17
 
 - Add a `support_level` entry to the configuration file of the extensions
 
@@ -333,7 +333,7 @@
 - Added `wp.ones()` to efficiently create one-initialized arrays
 - Rename `wp.config.graph_capture_module_load_default` to `wp.config.enable_graph_capture_module_load_by_default`
 
-## [0.14.0] - 2024-02-19
+## 0.14.0 - 2024-02-19
 
 - Add support for CUDA pooled (stream-ordered) allocators
   - Support memory allocation during graph capture
@@ -370,7 +370,7 @@
   - Fixed a small CPU memory leak related to DLPack interop
 - Improved performance of creating arrays
 
-## [0.13.1] - 2024-02-22
+## 0.13.1 - 2024-02-22
 
 - Ensure that the results from the `Noise Deform` are deterministic across different Kit sessions
 
@@ -383,7 +383,7 @@
 - Add missing `.py` extension to `warp/tests/walkthrough_debug`
 - Allow `wp.bool` usage in vector and matrix types
 
-## [0.12.0] - 2024-02-05
+## 0.12.0 - 2024-02-05
 
 - Add a warning when the `enable_backward` setting is set to `False` upon calling `wp.Tape.backward()`
 - Fix kernels not being recompiled as expected when defined using a closure
@@ -399,7 +399,7 @@
   - Point releases (if any) go on the same minor release branch and only contain bug fixes, not new features.
   - The `public` branch, previously used to merge releases into and corresponding with the GitHub `main` branch, is retired.
 
-## [1.0.0-beta.7] - 2024-01-23
+## 1.0.0-beta.7 - 2024-01-23
 
 - Ensure captures are always enclosed in `try`/`finally`
 - Only include .py files from the warp subdirectory into wheel packages
@@ -461,7 +461,7 @@
 - Documentation update for `wp.BVH`
 - Documentation and simplified API for runtime kernel specialization `wp.Kernel`
 
-## [1.0.0-beta.4] - 2023-11-01
+## 1.0.0-beta.4 - 2023-11-01
 
 - Add `wp.cbrt()` for cube root calculation
 - Add `wp.mesh_furthest_point_no_sign()` to compute furthest point on a surface from a query point
@@ -473,7 +473,7 @@
 - Fix for `wp.utils.array_sum()` output initialization when used with vector types
 - Coverage and documentation updates
 
-## [1.0.0-beta.3] - 2023-10-19
+## 1.0.0-beta.3 - 2023-10-19
 
 - Add support for code coverage scans (test_coverage.py), coverage at 85% in `omni.warp.core`
 - Add support for named component access for vector types, e.g.: `a = v.x`
@@ -495,13 +495,13 @@
 
 - To support grid-stride kernels, `wp.tid()` can no longer be called inside `wp.func` functions.
 
-## [1.0.0-beta.2] - 2023-09-01
+## 1.0.0-beta.2 - 2023-09-01
 
 - Fix for passing bool into `wp.func` functions
 - Fix for deprecation warnings appearing on `stderr`, now redirected to `stdout`
 - Fix for using `for i in wp.hash_grid_query(..)` syntax
 
-## [1.0.0-beta.1] - 2023-08-29
+## 1.0.0-beta.1 - 2023-08-29
 
 - Fix for `wp.float16` being passed as kernel arguments
 - Fix for compile errors with kernels using structs in backward pass
@@ -540,7 +540,7 @@
 - Update margin used by for mesh queries when using `wp.sim.create_soft_body_contacts()`
 - Improvements to gradient handling with `wp.from_torch()`, `wp.to_torch()` plus documentation
 
-## [0.10.0] - 2023-07-05
+## 0.10.0 - 2023-07-05
 
 - Add support for macOS universal binaries (x86 + aarch64) for M1+ support
 - Add additional methods for SDF generation please see the following new methods:
@@ -616,7 +616,7 @@
 - Deprecate `wp.Model.soft_contact_distance` which is now replaced by `wp.Model.particle_radius`
 - Deprecate single scalar particle radius (should be a per-particle array)
 
-## [0.8.2] - 2023-04-21
+## 0.8.2 - 2023-04-21
 
 - Add `ModelBuilder.soft_contact_max` to control the maximum number of soft contacts that can be registered. Use `Model.allocate_soft_contacts(new_count)` to change count on existing `Model` objects.
 - Add support for `bool` parameters
@@ -627,12 +627,12 @@
 - Add sign determination using winding number of `wp.mesh_query_point()` as `wp.mesh_query_sign_winding_number()`
 - Add query point without sign determination `wp.mesh_query_no_sign()`
 
-## [0.8.1] - 2023-04-13
+## 0.8.1 - 2023-04-13
 
 - Fix for regression when passing flattened numeric lists as matrix arguments to kernels
 - Fix for regressions when passing `wp.struct` types with uninitialized (`None`) member attributes
 
-## [0.8.0] - 2023-04-05
+## 0.8.0 - 2023-04-05
 
 - Add `Texture Write` node for updating dynamic RTX textures from Warp kernels / nodes
 - Add multi-dimensional kernel support to Warp Kernel Node
@@ -676,14 +676,14 @@
 - `wp.sim.model.ground_plane` is now a `wp.array` to support gradient, users should call `builder.set_ground_plane()` to create the ground 
 - `wp.sim` capsule, cones, and cylinders are now aligned with the default USD up-axis
 
-## [0.7.2] - 2023-02-15
+## 0.7.2 - 2023-02-15
 
 - Reduce test time for vec/math types
 - Clean-up CUDA disabled build pipeline
 - Remove extension.gen.toml to make Kit packages Python version independent
 - Handle additional cases for array indexing inside Python
 
-## [0.7.1] - 2023-02-14
+## 0.7.1 - 2023-02-14
 
 - Disabling some slow tests for Kit
 - Make unit tests run on first GPU only by default
@@ -700,13 +700,13 @@
 - Add security pop-up for Kernel Node
 - Improve error handling for kernel return values
 
-## [0.6.3] - 2023-01-31
+## 0.6.3 - 2023-01-31
 
 - Add DLPack utilities, see `wp.from_dlpack()`, `wp.to_dlpack()`
 - Add Jax utilities, see `wp.from_jax()`, `wp.to_jax()`, `wp.device_from_jax()`, `wp.device_to_jax()`
 - Fix for Linux Kit extensions OM-80132, OM-80133
 
-## [0.6.2] - 2023-01-19
+## 0.6.2 - 2023-01-19
 
 - Updated `wp.from_torch()` to support more data types
 - Updated `wp.from_torch()` to automatically determine the target Warp data type if not specified
@@ -721,14 +721,14 @@
 - Replace Python `imp` package with `importlib`
 - Fix for quaternion slerp gradients (`wp.quat_slerp()`)
 
-## [0.6.1] - 2022-12-05
+## 0.6.1 - 2022-12-05
 
 - Fix for non-CUDA builds
 - Fix strides computation in array_t constructor, fixes a bug with accessing mesh indices through mesh.indices[]
 - Disable backward pass code generation for kernel node (4-6x faster compilation)
 - Switch to linbuild for universal Linux binaries (affects TeamCity builds only)
 
-## [0.6.0] - 2022-11-28
+## 0.6.0 - 2022-11-28
 
 - Add support for CUDA streams, see `wp.Stream`, `wp.get_stream()`, `wp.set_stream()`, `wp.synchronize_stream()`, `wp.ScopedStream`
 - Add support for CUDA events, see `wp.Event`, `wp.record_event()`, `wp.wait_event()`, `wp.wait_stream()`, `wp.Stream.record_event()`, `wp.Stream.wait_event()`, `wp.Stream.wait_stream()`
@@ -753,7 +753,7 @@
 - Fix various deployment issues by statically linking with all CUDA libs
 - Update warp.so/warp.dll to CUDA Toolkit 11.5
 
-## [0.5.1] - 2022-11-01
+## 0.5.1 - 2022-11-01
 
 - Fix for unit tests in Kit
 
@@ -790,14 +790,14 @@
 - Fix for arrays > 2GB in length
 - Add support for per-vertex USD mesh colors with `wp.render` class
 
-## [0.4.2] - 2022-09-07
+## 0.4.2 - 2022-09-07
 
 - Register Warp samples to the sample browser in Kit
 - Add NDEBUG flag to release mode kernel builds
 - Fix for particle solver node when using a large number of particles
 - Fix for broken cameras in Warp sample scenes
 
-## [0.4.1] - 2022-08-30
+## 0.4.1 - 2022-08-30
 
 - Add geometry sampling methods, see `wp.sample_unit_cube()`, `wp.sample_unit_disk()`, etc
 - Add `wp.lower_bound()` for searching sorted arrays
@@ -807,7 +807,7 @@
 - Fix for debug flags not being set correctly on CUDA when `wp.config.mode == "debug"`, this enables bounds checking on CUDA kernels in debug mode
 - Fix for code gen of functions that do not return a value
 
-## [0.4.0] - 2022-08-09
+## 0.4.0 - 2022-08-09
 
 - Fix for FP16 conversions on GPUs without hardware support
 - Fix for `runtime = None` errors when reloading the Warp module
@@ -824,7 +824,7 @@
 
 - Removed `wp.runtime` reference from the top-level module, as it should be considered private
 
-## [0.3.2] - 2022-07-19
+## 0.3.2 - 2022-07-19
 
 - Remove Torch import from `__init__.py`, defer import to `wp.from_torch()`, `wp.to_torch()`
 
@@ -846,7 +846,7 @@
 - `wp.synchronize()` now synchronizes all devices; for finer-grained control, use `wp.synchronize_device()`
 - Device alias `"cuda"` now refers to the current CUDA context, rather than a specific device like `"cuda:0"` or `"cuda:1"`
 
-## [0.3.0] - 2022-07-08
+## 0.3.0 - 2022-07-08
 
 - Add support for FP16 storage type, see `wp.float16`
 - Add support for per-dimension byte strides, see `wp.array.strides`
@@ -883,7 +883,7 @@
 - Tape `capture` option has been removed, users can now capture tapes inside existing CUDA graphs (e.g.: inside Torch)
 - Scalar loss arrays should now explicitly set `requires_grad=True` at creation time
 
-## [0.2.2] - 2022-05-30
+## 0.2.2 - 2022-05-30
 
 - Fix for `from import *` inside Warp initialization
 - Fix for body space velocity when using deforming Mesh objects with scale
@@ -907,7 +907,7 @@
 - Local `@wp.func` functions should not be namespaced when called, e.g.: previously `wp.myfunc()` would work even if `myfunc()` was not a builtin
 - Removed `wp.rpy2quat()`, please use `wp.quat_rpy()` instead
 
-## [0.2.1] - 2022-05-11
+## 0.2.1 - 2022-05-11
 
 - Fix for unit tests in Kit
 
@@ -956,7 +956,7 @@
 - `wp.array.length` member has been removed, please use `wp.array.shape` to access array dimensions, or use `wp.array.size` to get total element count
 - Marking `dense_gemm()`, `dense_chol()`, etc methods as experimental until we revisit them
 
-## [0.1.25] - 2022-03-20
+## 0.1.25 - 2022-03-20
 
 - Add support for class methods to be Warp kernels
 - Add HashGrid reserve() so it can be used with CUDA graphs
@@ -966,7 +966,7 @@
 - Add support for floored division on integer types
 - Move tests into core library so they can be run in Kit environment
 
-## [0.1.24] - 2022-03-03
+## 0.1.24 - 2022-03-03
 
 ### Warp Core
 
@@ -982,7 +982,7 @@
 - Fix for ranged for loops with negative step sizes
 - Fix for 3d and 4d spherical gradient distributions
 
-## [0.1.23] - 2022-02-17
+## 0.1.23 - 2022-02-17
 
 ### Warp Core
 
@@ -992,7 +992,7 @@
 - Add procedural noise primitives, see `wp.noise()`, `wp.pnoise()`, `wp.curlnoise()`
 - Move simulation helpers our of test into `wp.sim` module
 
-## [0.1.22] - 2022-02-14
+## 0.1.22 - 2022-02-14
 
 ### Warp Core
 
@@ -1006,7 +1006,7 @@
 
 - Add support for universal and compound joint types
 
-## [0.1.21] - 2022-01-19
+## 0.1.21 - 2022-01-19
 
 ### Warp Core
 
@@ -1026,19 +1026,19 @@
 - New OgnParticleVolume node for sampling shapes -> particles
 - New OgnParticleSolver node for DEM style granular materials
 
-## [0.1.20] - 2021-11-02
+## 0.1.20 - 2021-11-02
 
 - Updates to the ripple solver for GTC (support for multiple colliders, buoyancy, etc)
 
-## [0.1.19] - 2021-10-15
+## 0.1.19 - 2021-10-15
 
 - Publish from 2021.3 to avoid omni.graph database incompatibilities
 
-## [0.1.18] - 2021-10-08
+## 0.1.18 - 2021-10-08
 
 - Enable Linux support (tested on 20.04)
 
-## [0.1.17] - 2021-09-30
+## 0.1.17 - 2021-09-30
 
 - Fix for 3x3 SVD adjoint
 - Fix for A6000 GPU (bump compute model to sm_52 minimum)
@@ -1047,12 +1047,12 @@
 - Rename spatial_transform -> transform
 - Documentation update
 
-## [0.1.16] - 2021-09-06
+## 0.1.16 - 2021-09-06
 
 - Fix for case where simple assignments (a = b) incorrectly generated reference rather than value copy
 - Handle passing zero-length (empty) arrays to kernels
 
-## [0.1.15] - 2021-09-03
+## 0.1.15 - 2021-09-03
 
 - Add additional math library functions (asin, etc)
 - Add builtin 3x3 SVD support
@@ -1065,62 +1065,62 @@
 - Removes the need to transfer array to CPU before numpy conversion (will be done implicitly)
 - Update the example OgnRipple wave equation solver to use bundles
 
-## [0.1.14] - 2021-08-09
+## 0.1.14 - 2021-08-09
 
 - Fix for out-of-bounds memory access in CUDA BVH
 - Better error checking after kernel launches (use `wp.config.verify_cuda=True`)
 - Fix for vec3 normalize adjoint code
 
-## [0.1.13] - 2021-07-29
+## 0.1.13 - 2021-07-29
 
 - Remove OgnShrinkWrap.py test node
 
-## [0.1.12] - 2021-07-29
+## 0.1.12 - 2021-07-29
 
 - Switch to Woop et al.'s watertight ray-tri intersection test
 - Disable --fast-math in CUDA compilation step for improved precision
 
-## [0.1.11] - 2021-07-28
+## 0.1.11 - 2021-07-28
 
 - Fix for `wp.mesh_query_ray()` returning incorrect t-value
 
-## [0.1.10] - 2021-07-28
+## 0.1.10 - 2021-07-28
 
 - Fix for OV extension fwatcher filters to avoid hot-reload loop due to OGN regeneration
 
-## [0.1.9] - 2021-07-21
+## 0.1.9 - 2021-07-21
 
 - Fix for loading sibling DLL paths
 - Better type checking for built-in function arguments
 - Added runtime docs, can now list all builtins using `wp.print_builtins()`
 
-## [0.1.8] - 2021-07-14
+## 0.1.8 - 2021-07-14
 
 - Fix for hot-reload of CUDA kernels
 - Add Tape object for replaying differentiable kernels
 - Add helpers for Torch interop (convert `torch.Tensor` to `wp.Array`)
 
-## [0.1.7] - 2021-07-05
+## 0.1.7 - 2021-07-05
 
 - Switch to NVRTC for CUDA runtime
 - Allow running without host compiler
 - Disable asserts in kernel release mode (small perf. improvement)
 
-## [0.1.6] - 2021-06-14
+## 0.1.6 - 2021-06-14
 
 - Look for CUDA toolchain in target-deps
 
-## [0.1.5] - 2021-06-14
+## 0.1.5 - 2021-06-14
 
 - Rename OgLang -> Warp
 - Improve CUDA environment error checking
 - Clean-up some logging, add verbose mode (`wp.config.verbose`)
 
-## [0.1.4] - 2021-06-10
+## 0.1.4 - 2021-06-10
 
 - Add support for mesh raycast
 
-## [0.1.3] - 2021-06-09
+## 0.1.3 - 2021-06-09
 
 - Add support for unary negation operator
 - Add support for mutating variables during dynamic loops (non-differentiable)
@@ -1128,7 +1128,7 @@
 - Improve kernel cache start up times (avoids adjointing before cache check)
 - Update README.md with requirements / examples
 
-## [0.1.2] - 2021-06-03
+## 0.1.2 - 2021-06-03
 
 - Add support for querying mesh velocities
 - Add CUDA graph support, see `wp.capture_begin()`, `wp.capture_end()`, `wp.capture_launch()`
@@ -1138,11 +1138,11 @@
 
 - Fix for Linux/macOS support
 
-## [0.1.1] - 2021-05-18
+## 0.1.1 - 2021-05-18
 
 - Fix bug with conflicting CUDA contexts
 
-## [0.1.0] - 2021-05-17
+## 0.1.0 - 2021-05-17
 
 - Initial publish for alpha testing
 
diff --git a/docs/changelog.md b/docs/changelog.md
new file mode 100644
index 000000000..4e68f7076
--- /dev/null
+++ b/docs/changelog.md
@@ -0,0 +1,8 @@
+---
+tocdepth: 2
+---
+
+<!--- This file simply includes the top-level changelog --->
+
+```{include} ../CHANGELOG.md
+```
diff --git a/docs/conf.py b/docs/conf.py
index 400d0c77f..f77e02ff2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -34,6 +34,7 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
+    "myst_parser",  # Parse markdown files
     "sphinx.ext.autodoc",
     "sphinx.ext.napoleon",  # Convert docstrings to reStructuredText
     "sphinx.ext.intersphinx",
@@ -74,6 +75,11 @@
     "github": ("https://github.com/NVIDIA/warp/blob/main/%s", "%s"),
 }
 
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".md": "markdown",
+}
+
 
 def linkcode_resolve(domain, info):
     """Tries to generate external links to code hosted on the Warp GitHub
diff --git a/docs/index.rst b/docs/index.rst
index be23fc3b6..ac2c209eb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -356,6 +356,7 @@ Full Table of Contents
     limitations
     modules/contribution_guide
     faq
+    changelog
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/requirements.txt b/docs/requirements.txt
index b8b6bd59f..c8626adc7 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -3,3 +3,4 @@ sphinx==8.0.2
 sphinx_copybutton==0.5.2
 numpy==2.1.1
 ruff==0.6.8
+myst-parser==4.0.0
diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md
index 82fb2e738..2792434bb 100644
--- a/exts/omni.warp.core/docs/CHANGELOG.md
+++ b/exts/omni.warp.core/docs/CHANGELOG.md
@@ -72,15 +72,14 @@
 
 - Bug fixes
   - Fix an aliasing issue with zero-copy array initialization from NumPy introduced in Warp 1.3.0.
-  - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values.
+  - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values ([GH-312](https://github.com/NVIDIA/warp/pull/312)).
 
 ## [1.3.2] - 2024-08-30
 
 - Bug fixes
   - Fix accuracy of 3x3 SVD ``wp.svd3`` with fp64 numbers ([GH-281](https://github.com/NVIDIA/warp/issues/281)).
   - Fix module hashing when a kernel argument contained a struct array ([GH-287](https://github.com/NVIDIA/warp/issues/287)).
-  - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used
-  ([GH-288](https://github.com/NVIDIA/warp/issues/288)).
+  - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used ([GH-288](https://github.com/NVIDIA/warp/issues/288)).
   - Fix errors when launching a CUDA graph after a module is reloaded. Modules that were used during graph capture
     will no longer be unloaded before the graph is released.
   - Fix a bug in `wp.sim.collide.triangle_closest_point_barycentric()` where the returned barycentric coordinates may be
@@ -205,7 +204,7 @@
 - Fix for handling of `bool` types in generic kernels
 - Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details
 
-## [1.1.1] - 2024-05-24
+## 1.1.1 - 2024-05-24
 
 - `wp.init()` is no longer required to be called explicitly and will be performed on first call to the API
 - Speed up `omni.warp.core`'s startup time
@@ -240,7 +239,7 @@
 - Support gradient propagation for indexing sliced multi-dimensional arrays, i.e. `a[i][j]` vs. `a[i, j]`
 - Provide an informative message if setting DLL C-types failed, instructing to try rebuilding the library
 
-## [1.0.3] - 2024-04-17
+## 1.0.3 - 2024-04-17
 
 - Add a `support_level` entry to the configuration file of the extensions
 
@@ -318,7 +317,7 @@
 - Added `wp.ones()` to efficiently create one-initialized arrays
 - Rename `wp.config.graph_capture_module_load_default` to `wp.config.enable_graph_capture_module_load_by_default`
 
-## [0.14.0] - 2024-02-19
+## 0.14.0 - 2024-02-19
 
 - Add support for CUDA pooled (stream-ordered) allocators
   - Support memory allocation during graph capture
@@ -355,7 +354,7 @@
   - Fixed a small CPU memory leak related to DLPack interop
 - Improved performance of creating arrays
 
-## [0.13.1] - 2024-02-22
+## 0.13.1 - 2024-02-22
 
 - Ensure that the results from the `Noise Deform` are deterministic across different Kit sessions
 
@@ -368,7 +367,7 @@
 - Add missing `.py` extension to `warp/tests/walkthrough_debug`
 - Allow `wp.bool` usage in vector and matrix types
 
-## [0.12.0] - 2024-02-05
+## 0.12.0 - 2024-02-05
 
 - Add a warning when the `enable_backward` setting is set to `False` upon calling `wp.Tape.backward()`
 - Fix kernels not being recompiled as expected when defined using a closure
@@ -384,7 +383,7 @@
   - Point releases (if any) go on the same minor release branch and only contain bug fixes, not new features.
   - The `public` branch, previously used to merge releases into and corresponding with the GitHub `main` branch, is retired.
 
-## [1.0.0-beta.7] - 2024-01-23
+## 1.0.0-beta.7 - 2024-01-23
 
 - Ensure captures are always enclosed in `try`/`finally`
 - Only include .py files from the warp subdirectory into wheel packages
@@ -446,7 +445,7 @@
 - Documentation update for `wp.BVH`
 - Documentation and simplified API for runtime kernel specialization `wp.Kernel`
 
-## [1.0.0-beta.4] - 2023-11-01
+## 1.0.0-beta.4 - 2023-11-01
 
 - Add `wp.cbrt()` for cube root calculation
 - Add `wp.mesh_furthest_point_no_sign()` to compute furthest point on a surface from a query point
@@ -458,7 +457,7 @@
 - Fix for `wp.utils.array_sum()` output initialization when used with vector types
 - Coverage and documentation updates
 
-## [1.0.0-beta.3] - 2023-10-19
+## 1.0.0-beta.3 - 2023-10-19
 
 - Add support for code coverage scans (test_coverage.py), coverage at 85% in `omni.warp.core`
 - Add support for named component access for vector types, e.g.: `a = v.x`
@@ -480,13 +479,13 @@
 
 - To support grid-stride kernels, `wp.tid()` can no longer be called inside `wp.func` functions.
 
-## [1.0.0-beta.2] - 2023-09-01
+## 1.0.0-beta.2 - 2023-09-01
 
 - Fix for passing bool into `wp.func` functions
 - Fix for deprecation warnings appearing on `stderr`, now redirected to `stdout`
 - Fix for using `for i in wp.hash_grid_query(..)` syntax
 
-## [1.0.0-beta.1] - 2023-08-29
+## 1.0.0-beta.1 - 2023-08-29
 
 - Fix for `wp.float16` being passed as kernel arguments
 - Fix for compile errors with kernels using structs in backward pass
@@ -525,7 +524,7 @@
 - Update margin used by for mesh queries when using `wp.sim.create_soft_body_contacts()`
 - Improvements to gradient handling with `wp.from_torch()`, `wp.to_torch()` plus documentation
 
-## [0.10.0] - 2023-07-05
+## 0.10.0 - 2023-07-05
 
 - Add support for macOS universal binaries (x86 + aarch64) for M1+ support
 - Add additional methods for SDF generation please see the following new methods:
@@ -601,7 +600,7 @@
 - Deprecate `wp.Model.soft_contact_distance` which is now replaced by `wp.Model.particle_radius`
 - Deprecate single scalar particle radius (should be a per-particle array)
 
-## [0.8.2] - 2023-04-21
+## 0.8.2 - 2023-04-21
 
 - Add `ModelBuilder.soft_contact_max` to control the maximum number of soft contacts that can be registered. Use `Model.allocate_soft_contacts(new_count)` to change count on existing `Model` objects.
 - Add support for `bool` parameters
@@ -612,12 +611,12 @@
 - Add sign determination using winding number of `wp.mesh_query_point()` as `wp.mesh_query_sign_winding_number()`
 - Add query point without sign determination `wp.mesh_query_no_sign()`
 
-## [0.8.1] - 2023-04-13
+## 0.8.1 - 2023-04-13
 
 - Fix for regression when passing flattened numeric lists as matrix arguments to kernels
 - Fix for regressions when passing `wp.struct` types with uninitialized (`None`) member attributes
 
-## [0.8.0] - 2023-04-05
+## 0.8.0 - 2023-04-05
 
 - Add `Texture Write` node for updating dynamic RTX textures from Warp kernels / nodes
 - Add multi-dimensional kernel support to Warp Kernel Node
@@ -661,14 +660,14 @@
 - `wp.sim.model.ground_plane` is now a `wp.array` to support gradient, users should call `builder.set_ground_plane()` to create the ground 
 - `wp.sim` capsule, cones, and cylinders are now aligned with the default USD up-axis
 
-## [0.7.2] - 2023-02-15
+## 0.7.2 - 2023-02-15
 
 - Reduce test time for vec/math types
 - Clean-up CUDA disabled build pipeline
 - Remove extension.gen.toml to make Kit packages Python version independent
 - Handle additional cases for array indexing inside Python
 
-## [0.7.1] - 2023-02-14
+## 0.7.1 - 2023-02-14
 
 - Disabling some slow tests for Kit
 - Make unit tests run on first GPU only by default
@@ -685,13 +684,13 @@
 - Add security pop-up for Kernel Node
 - Improve error handling for kernel return values
 
-## [0.6.3] - 2023-01-31
+## 0.6.3 - 2023-01-31
 
 - Add DLPack utilities, see `wp.from_dlpack()`, `wp.to_dlpack()`
 - Add Jax utilities, see `wp.from_jax()`, `wp.to_jax()`, `wp.device_from_jax()`, `wp.device_to_jax()`
 - Fix for Linux Kit extensions OM-80132, OM-80133
 
-## [0.6.2] - 2023-01-19
+## 0.6.2 - 2023-01-19
 
 - Updated `wp.from_torch()` to support more data types
 - Updated `wp.from_torch()` to automatically determine the target Warp data type if not specified
@@ -706,14 +705,14 @@
 - Replace Python `imp` package with `importlib`
 - Fix for quaternion slerp gradients (`wp.quat_slerp()`)
 
-## [0.6.1] - 2022-12-05
+## 0.6.1 - 2022-12-05
 
 - Fix for non-CUDA builds
 - Fix strides computation in array_t constructor, fixes a bug with accessing mesh indices through mesh.indices[]
 - Disable backward pass code generation for kernel node (4-6x faster compilation)
 - Switch to linbuild for universal Linux binaries (affects TeamCity builds only)
 
-## [0.6.0] - 2022-11-28
+## 0.6.0 - 2022-11-28
 
 - Add support for CUDA streams, see `wp.Stream`, `wp.get_stream()`, `wp.set_stream()`, `wp.synchronize_stream()`, `wp.ScopedStream`
 - Add support for CUDA events, see `wp.Event`, `wp.record_event()`, `wp.wait_event()`, `wp.wait_stream()`, `wp.Stream.record_event()`, `wp.Stream.wait_event()`, `wp.Stream.wait_stream()`
@@ -738,7 +737,7 @@
 - Fix various deployment issues by statically linking with all CUDA libs
 - Update warp.so/warp.dll to CUDA Toolkit 11.5
 
-## [0.5.1] - 2022-11-01
+## 0.5.1 - 2022-11-01
 
 - Fix for unit tests in Kit
 
@@ -775,14 +774,14 @@
 - Fix for arrays > 2GB in length
 - Add support for per-vertex USD mesh colors with `wp.render` class
 
-## [0.4.2] - 2022-09-07
+## 0.4.2 - 2022-09-07
 
 - Register Warp samples to the sample browser in Kit
 - Add NDEBUG flag to release mode kernel builds
 - Fix for particle solver node when using a large number of particles
 - Fix for broken cameras in Warp sample scenes
 
-## [0.4.1] - 2022-08-30
+## 0.4.1 - 2022-08-30
 
 - Add geometry sampling methods, see `wp.sample_unit_cube()`, `wp.sample_unit_disk()`, etc
 - Add `wp.lower_bound()` for searching sorted arrays
@@ -792,7 +791,7 @@
 - Fix for debug flags not being set correctly on CUDA when `wp.config.mode == "debug"`, this enables bounds checking on CUDA kernels in debug mode
 - Fix for code gen of functions that do not return a value
 
-## [0.4.0] - 2022-08-09
+## 0.4.0 - 2022-08-09
 
 - Fix for FP16 conversions on GPUs without hardware support
 - Fix for `runtime = None` errors when reloading the Warp module
@@ -809,7 +808,7 @@
 
 - Removed `wp.runtime` reference from the top-level module, as it should be considered private
 
-## [0.3.2] - 2022-07-19
+## 0.3.2 - 2022-07-19
 
 - Remove Torch import from `__init__.py`, defer import to `wp.from_torch()`, `wp.to_torch()`
 
@@ -831,7 +830,7 @@
 - `wp.synchronize()` now synchronizes all devices; for finer-grained control, use `wp.synchronize_device()`
 - Device alias `"cuda"` now refers to the current CUDA context, rather than a specific device like `"cuda:0"` or `"cuda:1"`
 
-## [0.3.0] - 2022-07-08
+## 0.3.0 - 2022-07-08
 
 - Add support for FP16 storage type, see `wp.float16`
 - Add support for per-dimension byte strides, see `wp.array.strides`
@@ -868,7 +867,7 @@
 - Tape `capture` option has been removed, users can now capture tapes inside existing CUDA graphs (e.g.: inside Torch)
 - Scalar loss arrays should now explicitly set `requires_grad=True` at creation time
 
-## [0.2.2] - 2022-05-30
+## 0.2.2 - 2022-05-30
 
 - Fix for `from import *` inside Warp initialization
 - Fix for body space velocity when using deforming Mesh objects with scale
@@ -892,7 +891,7 @@
 - Local `@wp.func` functions should not be namespaced when called, e.g.: previously `wp.myfunc()` would work even if `myfunc()` was not a builtin
 - Removed `wp.rpy2quat()`, please use `wp.quat_rpy()` instead
 
-## [0.2.1] - 2022-05-11
+## 0.2.1 - 2022-05-11
 
 - Fix for unit tests in Kit
 
@@ -941,7 +940,7 @@
 - `wp.array.length` member has been removed, please use `wp.array.shape` to access array dimensions, or use `wp.array.size` to get total element count
 - Marking `dense_gemm()`, `dense_chol()`, etc methods as experimental until we revisit them
 
-## [0.1.25] - 2022-03-20
+## 0.1.25 - 2022-03-20
 
 - Add support for class methods to be Warp kernels
 - Add HashGrid reserve() so it can be used with CUDA graphs
@@ -951,7 +950,7 @@
 - Add support for floored division on integer types
 - Move tests into core library so they can be run in Kit environment
 
-## [0.1.24] - 2022-03-03
+## 0.1.24 - 2022-03-03
 
 ### Warp Core
 
@@ -967,7 +966,7 @@
 - Fix for ranged for loops with negative step sizes
 - Fix for 3d and 4d spherical gradient distributions
 
-## [0.1.23] - 2022-02-17
+## 0.1.23 - 2022-02-17
 
 ### Warp Core
 
@@ -977,7 +976,7 @@
 - Add procedural noise primitives, see `wp.noise()`, `wp.pnoise()`, `wp.curlnoise()`
 - Move simulation helpers our of test into `wp.sim` module
 
-## [0.1.22] - 2022-02-14
+## 0.1.22 - 2022-02-14
 
 ### Warp Core
 
@@ -991,7 +990,7 @@
 
 - Add support for universal and compound joint types
 
-## [0.1.21] - 2022-01-19
+## 0.1.21 - 2022-01-19
 
 ### Warp Core
 
@@ -1011,19 +1010,19 @@
 - New OgnParticleVolume node for sampling shapes -> particles
 - New OgnParticleSolver node for DEM style granular materials
 
-## [0.1.20] - 2021-11-02
+## 0.1.20 - 2021-11-02
 
 - Updates to the ripple solver for GTC (support for multiple colliders, buoyancy, etc)
 
-## [0.1.19] - 2021-10-15
+## 0.1.19 - 2021-10-15
 
 - Publish from 2021.3 to avoid omni.graph database incompatibilities
 
-## [0.1.18] - 2021-10-08
+## 0.1.18 - 2021-10-08
 
 - Enable Linux support (tested on 20.04)
 
-## [0.1.17] - 2021-09-30
+## 0.1.17 - 2021-09-30
 
 - Fix for 3x3 SVD adjoint
 - Fix for A6000 GPU (bump compute model to sm_52 minimum)
@@ -1032,12 +1031,12 @@
 - Rename spatial_transform -> transform
 - Documentation update
 
-## [0.1.16] - 2021-09-06
+## 0.1.16 - 2021-09-06
 
 - Fix for case where simple assignments (a = b) incorrectly generated reference rather than value copy
 - Handle passing zero-length (empty) arrays to kernels
 
-## [0.1.15] - 2021-09-03
+## 0.1.15 - 2021-09-03
 
 - Add additional math library functions (asin, etc)
 - Add builtin 3x3 SVD support
@@ -1050,62 +1049,62 @@
 - Removes the need to transfer array to CPU before numpy conversion (will be done implicitly)
 - Update the example OgnRipple wave equation solver to use bundles
 
-## [0.1.14] - 2021-08-09
+## 0.1.14 - 2021-08-09
 
 - Fix for out-of-bounds memory access in CUDA BVH
 - Better error checking after kernel launches (use `wp.config.verify_cuda=True`)
 - Fix for vec3 normalize adjoint code
 
-## [0.1.13] - 2021-07-29
+## 0.1.13 - 2021-07-29
 
 - Remove OgnShrinkWrap.py test node
 
-## [0.1.12] - 2021-07-29
+## 0.1.12 - 2021-07-29
 
 - Switch to Woop et al.'s watertight ray-tri intersection test
 - Disable --fast-math in CUDA compilation step for improved precision
 
-## [0.1.11] - 2021-07-28
+## 0.1.11 - 2021-07-28
 
 - Fix for `wp.mesh_query_ray()` returning incorrect t-value
 
-## [0.1.10] - 2021-07-28
+## 0.1.10 - 2021-07-28
 
 - Fix for OV extension fwatcher filters to avoid hot-reload loop due to OGN regeneration
 
-## [0.1.9] - 2021-07-21
+## 0.1.9 - 2021-07-21
 
 - Fix for loading sibling DLL paths
 - Better type checking for built-in function arguments
 - Added runtime docs, can now list all builtins using `wp.print_builtins()`
 
-## [0.1.8] - 2021-07-14
+## 0.1.8 - 2021-07-14
 
 - Fix for hot-reload of CUDA kernels
 - Add Tape object for replaying differentiable kernels
 - Add helpers for Torch interop (convert `torch.Tensor` to `wp.Array`)
 
-## [0.1.7] - 2021-07-05
+## 0.1.7 - 2021-07-05
 
 - Switch to NVRTC for CUDA runtime
 - Allow running without host compiler
 - Disable asserts in kernel release mode (small perf. improvement)
 
-## [0.1.6] - 2021-06-14
+## 0.1.6 - 2021-06-14
 
 - Look for CUDA toolchain in target-deps
 
-## [0.1.5] - 2021-06-14
+## 0.1.5 - 2021-06-14
 
 - Rename OgLang -> Warp
 - Improve CUDA environment error checking
 - Clean-up some logging, add verbose mode (`wp.config.verbose`)
 
-## [0.1.4] - 2021-06-10
+## 0.1.4 - 2021-06-10
 
 - Add support for mesh raycast
 
-## [0.1.3] - 2021-06-09
+## 0.1.3 - 2021-06-09
 
 - Add support for unary negation operator
 - Add support for mutating variables during dynamic loops (non-differentiable)
@@ -1113,7 +1112,7 @@
 - Improve kernel cache start up times (avoids adjointing before cache check)
 - Update README.md with requirements / examples
 
-## [0.1.2] - 2021-06-03
+## 0.1.2 - 2021-06-03
 
 - Add support for querying mesh velocities
 - Add CUDA graph support, see `wp.capture_begin()`, `wp.capture_end()`, `wp.capture_launch()`
@@ -1123,10 +1122,10 @@
 
 - Fix for Linux/macOS support
 
-## [0.1.1] - 2021-05-18
+## 0.1.1 - 2021-05-18
 
 - Fix bug with conflicting CUDA contexts
 
-## [0.1.0] - 2021-05-17
+## 0.1.0 - 2021-05-17
 
 - Initial publish for alpha testing
diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md
index 82fb2e738..2792434bb 100644
--- a/exts/omni.warp/docs/CHANGELOG.md
+++ b/exts/omni.warp/docs/CHANGELOG.md
@@ -72,15 +72,14 @@
 
 - Bug fixes
   - Fix an aliasing issue with zero-copy array initialization from NumPy introduced in Warp 1.3.0.
-  - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values.
+  - Fix `wp.Volume.load_from_numpy()` behavior when `bg_value` is a sequence of values ([GH-312](https://github.com/NVIDIA/warp/pull/312)).
 
 ## [1.3.2] - 2024-08-30
 
 - Bug fixes
   - Fix accuracy of 3x3 SVD ``wp.svd3`` with fp64 numbers ([GH-281](https://github.com/NVIDIA/warp/issues/281)).
   - Fix module hashing when a kernel argument contained a struct array ([GH-287](https://github.com/NVIDIA/warp/issues/287)).
-  - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used
-  ([GH-288](https://github.com/NVIDIA/warp/issues/288)).
+  - Fix a bug in `wp.bvh_query_ray()` where the direction instead of the reciprocal direction was used ([GH-288](https://github.com/NVIDIA/warp/issues/288)).
   - Fix errors when launching a CUDA graph after a module is reloaded. Modules that were used during graph capture
     will no longer be unloaded before the graph is released.
   - Fix a bug in `wp.sim.collide.triangle_closest_point_barycentric()` where the returned barycentric coordinates may be
@@ -205,7 +204,7 @@
 - Fix for handling of `bool` types in generic kernels
 - Publish CUDA 12.5 binaries for Hopper support, see https://github.com/nvidia/warp?tab=readme-ov-file#installing for details
 
-## [1.1.1] - 2024-05-24
+## 1.1.1 - 2024-05-24
 
 - `wp.init()` is no longer required to be called explicitly and will be performed on first call to the API
 - Speed up `omni.warp.core`'s startup time
@@ -240,7 +239,7 @@
 - Support gradient propagation for indexing sliced multi-dimensional arrays, i.e. `a[i][j]` vs. `a[i, j]`
 - Provide an informative message if setting DLL C-types failed, instructing to try rebuilding the library
 
-## [1.0.3] - 2024-04-17
+## 1.0.3 - 2024-04-17
 
 - Add a `support_level` entry to the configuration file of the extensions
 
@@ -318,7 +317,7 @@
 - Added `wp.ones()` to efficiently create one-initialized arrays
 - Rename `wp.config.graph_capture_module_load_default` to `wp.config.enable_graph_capture_module_load_by_default`
 
-## [0.14.0] - 2024-02-19
+## 0.14.0 - 2024-02-19
 
 - Add support for CUDA pooled (stream-ordered) allocators
   - Support memory allocation during graph capture
@@ -355,7 +354,7 @@
   - Fixed a small CPU memory leak related to DLPack interop
 - Improved performance of creating arrays
 
-## [0.13.1] - 2024-02-22
+## 0.13.1 - 2024-02-22
 
 - Ensure that the results from the `Noise Deform` are deterministic across different Kit sessions
 
@@ -368,7 +367,7 @@
 - Add missing `.py` extension to `warp/tests/walkthrough_debug`
 - Allow `wp.bool` usage in vector and matrix types
 
-## [0.12.0] - 2024-02-05
+## 0.12.0 - 2024-02-05
 
 - Add a warning when the `enable_backward` setting is set to `False` upon calling `wp.Tape.backward()`
 - Fix kernels not being recompiled as expected when defined using a closure
@@ -384,7 +383,7 @@
   - Point releases (if any) go on the same minor release branch and only contain bug fixes, not new features.
   - The `public` branch, previously used to merge releases into and corresponding with the GitHub `main` branch, is retired.
 
-## [1.0.0-beta.7] - 2024-01-23
+## 1.0.0-beta.7 - 2024-01-23
 
 - Ensure captures are always enclosed in `try`/`finally`
 - Only include .py files from the warp subdirectory into wheel packages
@@ -446,7 +445,7 @@
 - Documentation update for `wp.BVH`
 - Documentation and simplified API for runtime kernel specialization `wp.Kernel`
 
-## [1.0.0-beta.4] - 2023-11-01
+## 1.0.0-beta.4 - 2023-11-01
 
 - Add `wp.cbrt()` for cube root calculation
 - Add `wp.mesh_furthest_point_no_sign()` to compute furthest point on a surface from a query point
@@ -458,7 +457,7 @@
 - Fix for `wp.utils.array_sum()` output initialization when used with vector types
 - Coverage and documentation updates
 
-## [1.0.0-beta.3] - 2023-10-19
+## 1.0.0-beta.3 - 2023-10-19
 
 - Add support for code coverage scans (test_coverage.py), coverage at 85% in `omni.warp.core`
 - Add support for named component access for vector types, e.g.: `a = v.x`
@@ -480,13 +479,13 @@
 
 - To support grid-stride kernels, `wp.tid()` can no longer be called inside `wp.func` functions.
 
-## [1.0.0-beta.2] - 2023-09-01
+## 1.0.0-beta.2 - 2023-09-01
 
 - Fix for passing bool into `wp.func` functions
 - Fix for deprecation warnings appearing on `stderr`, now redirected to `stdout`
 - Fix for using `for i in wp.hash_grid_query(..)` syntax
 
-## [1.0.0-beta.1] - 2023-08-29
+## 1.0.0-beta.1 - 2023-08-29
 
 - Fix for `wp.float16` being passed as kernel arguments
 - Fix for compile errors with kernels using structs in backward pass
@@ -525,7 +524,7 @@
 - Update margin used by for mesh queries when using `wp.sim.create_soft_body_contacts()`
 - Improvements to gradient handling with `wp.from_torch()`, `wp.to_torch()` plus documentation
 
-## [0.10.0] - 2023-07-05
+## 0.10.0 - 2023-07-05
 
 - Add support for macOS universal binaries (x86 + aarch64) for M1+ support
 - Add additional methods for SDF generation please see the following new methods:
@@ -601,7 +600,7 @@
 - Deprecate `wp.Model.soft_contact_distance` which is now replaced by `wp.Model.particle_radius`
 - Deprecate single scalar particle radius (should be a per-particle array)
 
-## [0.8.2] - 2023-04-21
+## 0.8.2 - 2023-04-21
 
 - Add `ModelBuilder.soft_contact_max` to control the maximum number of soft contacts that can be registered. Use `Model.allocate_soft_contacts(new_count)` to change count on existing `Model` objects.
 - Add support for `bool` parameters
@@ -612,12 +611,12 @@
 - Add sign determination using winding number of `wp.mesh_query_point()` as `wp.mesh_query_sign_winding_number()`
 - Add query point without sign determination `wp.mesh_query_no_sign()`
 
-## [0.8.1] - 2023-04-13
+## 0.8.1 - 2023-04-13
 
 - Fix for regression when passing flattened numeric lists as matrix arguments to kernels
 - Fix for regressions when passing `wp.struct` types with uninitialized (`None`) member attributes
 
-## [0.8.0] - 2023-04-05
+## 0.8.0 - 2023-04-05
 
 - Add `Texture Write` node for updating dynamic RTX textures from Warp kernels / nodes
 - Add multi-dimensional kernel support to Warp Kernel Node
@@ -661,14 +660,14 @@
 - `wp.sim.model.ground_plane` is now a `wp.array` to support gradient, users should call `builder.set_ground_plane()` to create the ground 
 - `wp.sim` capsule, cones, and cylinders are now aligned with the default USD up-axis
 
-## [0.7.2] - 2023-02-15
+## 0.7.2 - 2023-02-15
 
 - Reduce test time for vec/math types
 - Clean-up CUDA disabled build pipeline
 - Remove extension.gen.toml to make Kit packages Python version independent
 - Handle additional cases for array indexing inside Python
 
-## [0.7.1] - 2023-02-14
+## 0.7.1 - 2023-02-14
 
 - Disabling some slow tests for Kit
 - Make unit tests run on first GPU only by default
@@ -685,13 +684,13 @@
 - Add security pop-up for Kernel Node
 - Improve error handling for kernel return values
 
-## [0.6.3] - 2023-01-31
+## 0.6.3 - 2023-01-31
 
 - Add DLPack utilities, see `wp.from_dlpack()`, `wp.to_dlpack()`
 - Add Jax utilities, see `wp.from_jax()`, `wp.to_jax()`, `wp.device_from_jax()`, `wp.device_to_jax()`
 - Fix for Linux Kit extensions OM-80132, OM-80133
 
-## [0.6.2] - 2023-01-19
+## 0.6.2 - 2023-01-19
 
 - Updated `wp.from_torch()` to support more data types
 - Updated `wp.from_torch()` to automatically determine the target Warp data type if not specified
@@ -706,14 +705,14 @@
 - Replace Python `imp` package with `importlib`
 - Fix for quaternion slerp gradients (`wp.quat_slerp()`)
 
-## [0.6.1] - 2022-12-05
+## 0.6.1 - 2022-12-05
 
 - Fix for non-CUDA builds
 - Fix strides computation in array_t constructor, fixes a bug with accessing mesh indices through mesh.indices[]
 - Disable backward pass code generation for kernel node (4-6x faster compilation)
 - Switch to linbuild for universal Linux binaries (affects TeamCity builds only)
 
-## [0.6.0] - 2022-11-28
+## 0.6.0 - 2022-11-28
 
 - Add support for CUDA streams, see `wp.Stream`, `wp.get_stream()`, `wp.set_stream()`, `wp.synchronize_stream()`, `wp.ScopedStream`
 - Add support for CUDA events, see `wp.Event`, `wp.record_event()`, `wp.wait_event()`, `wp.wait_stream()`, `wp.Stream.record_event()`, `wp.Stream.wait_event()`, `wp.Stream.wait_stream()`
@@ -738,7 +737,7 @@
 - Fix various deployment issues by statically linking with all CUDA libs
 - Update warp.so/warp.dll to CUDA Toolkit 11.5
 
-## [0.5.1] - 2022-11-01
+## 0.5.1 - 2022-11-01
 
 - Fix for unit tests in Kit
 
@@ -775,14 +774,14 @@
 - Fix for arrays > 2GB in length
 - Add support for per-vertex USD mesh colors with `wp.render` class
 
-## [0.4.2] - 2022-09-07
+## 0.4.2 - 2022-09-07
 
 - Register Warp samples to the sample browser in Kit
 - Add NDEBUG flag to release mode kernel builds
 - Fix for particle solver node when using a large number of particles
 - Fix for broken cameras in Warp sample scenes
 
-## [0.4.1] - 2022-08-30
+## 0.4.1 - 2022-08-30
 
 - Add geometry sampling methods, see `wp.sample_unit_cube()`, `wp.sample_unit_disk()`, etc
 - Add `wp.lower_bound()` for searching sorted arrays
@@ -792,7 +791,7 @@
 - Fix for debug flags not being set correctly on CUDA when `wp.config.mode == "debug"`, this enables bounds checking on CUDA kernels in debug mode
 - Fix for code gen of functions that do not return a value
 
-## [0.4.0] - 2022-08-09
+## 0.4.0 - 2022-08-09
 
 - Fix for FP16 conversions on GPUs without hardware support
 - Fix for `runtime = None` errors when reloading the Warp module
@@ -809,7 +808,7 @@
 
 - Removed `wp.runtime` reference from the top-level module, as it should be considered private
 
-## [0.3.2] - 2022-07-19
+## 0.3.2 - 2022-07-19
 
 - Remove Torch import from `__init__.py`, defer import to `wp.from_torch()`, `wp.to_torch()`
 
@@ -831,7 +830,7 @@
 - `wp.synchronize()` now synchronizes all devices; for finer-grained control, use `wp.synchronize_device()`
 - Device alias `"cuda"` now refers to the current CUDA context, rather than a specific device like `"cuda:0"` or `"cuda:1"`
 
-## [0.3.0] - 2022-07-08
+## 0.3.0 - 2022-07-08
 
 - Add support for FP16 storage type, see `wp.float16`
 - Add support for per-dimension byte strides, see `wp.array.strides`
@@ -868,7 +867,7 @@
 - Tape `capture` option has been removed, users can now capture tapes inside existing CUDA graphs (e.g.: inside Torch)
 - Scalar loss arrays should now explicitly set `requires_grad=True` at creation time
 
-## [0.2.2] - 2022-05-30
+## 0.2.2 - 2022-05-30
 
 - Fix for `from import *` inside Warp initialization
 - Fix for body space velocity when using deforming Mesh objects with scale
@@ -892,7 +891,7 @@
 - Local `@wp.func` functions should not be namespaced when called, e.g.: previously `wp.myfunc()` would work even if `myfunc()` was not a builtin
 - Removed `wp.rpy2quat()`, please use `wp.quat_rpy()` instead
 
-## [0.2.1] - 2022-05-11
+## 0.2.1 - 2022-05-11
 
 - Fix for unit tests in Kit
 
@@ -941,7 +940,7 @@
 - `wp.array.length` member has been removed, please use `wp.array.shape` to access array dimensions, or use `wp.array.size` to get total element count
 - Marking `dense_gemm()`, `dense_chol()`, etc methods as experimental until we revisit them
 
-## [0.1.25] - 2022-03-20
+## 0.1.25 - 2022-03-20
 
 - Add support for class methods to be Warp kernels
 - Add HashGrid reserve() so it can be used with CUDA graphs
@@ -951,7 +950,7 @@
 - Add support for floored division on integer types
 - Move tests into core library so they can be run in Kit environment
 
-## [0.1.24] - 2022-03-03
+## 0.1.24 - 2022-03-03
 
 ### Warp Core
 
@@ -967,7 +966,7 @@
 - Fix for ranged for loops with negative step sizes
 - Fix for 3d and 4d spherical gradient distributions
 
-## [0.1.23] - 2022-02-17
+## 0.1.23 - 2022-02-17
 
 ### Warp Core
 
@@ -977,7 +976,7 @@
 - Add procedural noise primitives, see `wp.noise()`, `wp.pnoise()`, `wp.curlnoise()`
 - Move simulation helpers our of test into `wp.sim` module
 
-## [0.1.22] - 2022-02-14
+## 0.1.22 - 2022-02-14
 
 ### Warp Core
 
@@ -991,7 +990,7 @@
 
 - Add support for universal and compound joint types
 
-## [0.1.21] - 2022-01-19
+## 0.1.21 - 2022-01-19
 
 ### Warp Core
 
@@ -1011,19 +1010,19 @@
 - New OgnParticleVolume node for sampling shapes -> particles
 - New OgnParticleSolver node for DEM style granular materials
 
-## [0.1.20] - 2021-11-02
+## 0.1.20 - 2021-11-02
 
 - Updates to the ripple solver for GTC (support for multiple colliders, buoyancy, etc)
 
-## [0.1.19] - 2021-10-15
+## 0.1.19 - 2021-10-15
 
 - Publish from 2021.3 to avoid omni.graph database incompatibilities
 
-## [0.1.18] - 2021-10-08
+## 0.1.18 - 2021-10-08
 
 - Enable Linux support (tested on 20.04)
 
-## [0.1.17] - 2021-09-30
+## 0.1.17 - 2021-09-30
 
 - Fix for 3x3 SVD adjoint
 - Fix for A6000 GPU (bump compute model to sm_52 minimum)
@@ -1032,12 +1031,12 @@
 - Rename spatial_transform -> transform
 - Documentation update
 
-## [0.1.16] - 2021-09-06
+## 0.1.16 - 2021-09-06
 
 - Fix for case where simple assignments (a = b) incorrectly generated reference rather than value copy
 - Handle passing zero-length (empty) arrays to kernels
 
-## [0.1.15] - 2021-09-03
+## 0.1.15 - 2021-09-03
 
 - Add additional math library functions (asin, etc)
 - Add builtin 3x3 SVD support
@@ -1050,62 +1049,62 @@
 - Removes the need to transfer array to CPU before numpy conversion (will be done implicitly)
 - Update the example OgnRipple wave equation solver to use bundles
 
-## [0.1.14] - 2021-08-09
+## 0.1.14 - 2021-08-09
 
 - Fix for out-of-bounds memory access in CUDA BVH
 - Better error checking after kernel launches (use `wp.config.verify_cuda=True`)
 - Fix for vec3 normalize adjoint code
 
-## [0.1.13] - 2021-07-29
+## 0.1.13 - 2021-07-29
 
 - Remove OgnShrinkWrap.py test node
 
-## [0.1.12] - 2021-07-29
+## 0.1.12 - 2021-07-29
 
 - Switch to Woop et al.'s watertight ray-tri intersection test
 - Disable --fast-math in CUDA compilation step for improved precision
 
-## [0.1.11] - 2021-07-28
+## 0.1.11 - 2021-07-28
 
 - Fix for `wp.mesh_query_ray()` returning incorrect t-value
 
-## [0.1.10] - 2021-07-28
+## 0.1.10 - 2021-07-28
 
 - Fix for OV extension fwatcher filters to avoid hot-reload loop due to OGN regeneration
 
-## [0.1.9] - 2021-07-21
+## 0.1.9 - 2021-07-21
 
 - Fix for loading sibling DLL paths
 - Better type checking for built-in function arguments
 - Added runtime docs, can now list all builtins using `wp.print_builtins()`
 
-## [0.1.8] - 2021-07-14
+## 0.1.8 - 2021-07-14
 
 - Fix for hot-reload of CUDA kernels
 - Add Tape object for replaying differentiable kernels
 - Add helpers for Torch interop (convert `torch.Tensor` to `wp.Array`)
 
-## [0.1.7] - 2021-07-05
+## 0.1.7 - 2021-07-05
 
 - Switch to NVRTC for CUDA runtime
 - Allow running without host compiler
 - Disable asserts in kernel release mode (small perf. improvement)
 
-## [0.1.6] - 2021-06-14
+## 0.1.6 - 2021-06-14
 
 - Look for CUDA toolchain in target-deps
 
-## [0.1.5] - 2021-06-14
+## 0.1.5 - 2021-06-14
 
 - Rename OgLang -> Warp
 - Improve CUDA environment error checking
 - Clean-up some logging, add verbose mode (`wp.config.verbose`)
 
-## [0.1.4] - 2021-06-10
+## 0.1.4 - 2021-06-10
 
 - Add support for mesh raycast
 
-## [0.1.3] - 2021-06-09
+## 0.1.3 - 2021-06-09
 
 - Add support for unary negation operator
 - Add support for mutating variables during dynamic loops (non-differentiable)
@@ -1113,7 +1112,7 @@
 - Improve kernel cache start up times (avoids adjointing before cache check)
 - Update README.md with requirements / examples
 
-## [0.1.2] - 2021-06-03
+## 0.1.2 - 2021-06-03
 
 - Add support for querying mesh velocities
 - Add CUDA graph support, see `wp.capture_begin()`, `wp.capture_end()`, `wp.capture_launch()`
@@ -1123,10 +1122,10 @@
 
 - Fix for Linux/macOS support
 
-## [0.1.1] - 2021-05-18
+## 0.1.1 - 2021-05-18
 
 - Fix bug with conflicting CUDA contexts
 
-## [0.1.0] - 2021-05-17
+## 0.1.0 - 2021-05-17
 
 - Initial publish for alpha testing

From bc87d6fc7ca35f1489b80e3761cadf3e0dab1afb Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Mon, 14 Oct 2024 14:22:48 -0700
Subject: [PATCH 15/18] Update CHANGELOG for 1.4.1

---
 CHANGELOG.md                          | 17 ++++++-----------
 exts/omni.warp.core/docs/CHANGELOG.md | 11 +++++++++++
 exts/omni.warp/docs/CHANGELOG.md      | 11 +++++++++++
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f4ad172cf..7c0e4a210 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,20 +1,15 @@
 # Changelog
 
-## [Unreleased] - 2024-??
-
-### Added
-
-### Changed
+## [1.4.1] - 2024-10-15
 
 ### Fixed
 
 - Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
-
-- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes
-- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`
-- Fix invalid code generation error messages when nesting dynamic and static for-loops
-- Fix caching of kernels with static expressions
-- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation
+- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes.
+- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`.
+- Fix invalid code generation error messages when nesting dynamic and static for-loops.
+- Fix caching of kernels with static expressions.
+- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation.
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md
index 2792434bb..195391235 100644
--- a/exts/omni.warp.core/docs/CHANGELOG.md
+++ b/exts/omni.warp.core/docs/CHANGELOG.md
@@ -1,5 +1,16 @@
 # CHANGELOG
 
+## [1.4.1] - 2024-10-15
+
+### Fixed
+
+- Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
+- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes.
+- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`.
+- Fix invalid code generation error messages when nesting dynamic and static for-loops.
+- Fix caching of kernels with static expressions.
+- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation.
+
 ## [1.4.0] - 2024-10-01
 
 ### Added
diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md
index 2792434bb..195391235 100644
--- a/exts/omni.warp/docs/CHANGELOG.md
+++ b/exts/omni.warp/docs/CHANGELOG.md
@@ -1,5 +1,16 @@
 # CHANGELOG
 
+## [1.4.1] - 2024-10-15
+
+### Fixed
+
+- Fix `iter_reverse()` not working as expected for ranges with steps other than 1 ([GH-311](https://github.com/NVIDIA/warp/issues/311)).
+- Fix potential out-of-bounds memory access when a `wp.sparse.BsrMatrix` object is reused for storing matrices of different shapes.
+- Fix robustness to very low desired tolerance in `wp.fem.utils.symmetric_eigenvalues_qr`.
+- Fix invalid code generation error messages when nesting dynamic and static for-loops.
+- Fix caching of kernels with static expressions.
+- Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation.
+
 ## [1.4.0] - 2024-10-01
 
 ### Added

From 01da19c662ba91bf6540e02b829c8eef05cc88d3 Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Mon, 14 Oct 2024 14:24:16 -0700
Subject: [PATCH 16/18] Bump version to 1.4.1

---
 CHANGELOG.md                              | 3 ++-
 README.md                                 | 6 +++---
 VERSION.md                                | 2 +-
 docs/installation.rst                     | 6 +++---
 exts/omni.warp.core/config/extension.toml | 2 +-
 exts/omni.warp/config/extension.toml      | 4 ++--
 warp/config.py                            | 2 +-
 7 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c0e4a210..0e87e683b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1141,7 +1141,8 @@
 
 - Initial publish for alpha testing
 
-[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.4.0...HEAD
+[Unreleased]: https://github.com/NVIDIA/warp/compare/v1.4.1...HEAD
+[1.4.1]: https://github.com/NVIDIA/warp/releases/tag/v1.4.1
 [1.4.0]: https://github.com/NVIDIA/warp/releases/tag/v1.4.0
 [1.3.3]: https://github.com/NVIDIA/warp/releases/tag/v1.3.3
 [1.3.2]: https://github.com/NVIDIA/warp/releases/tag/v1.3.2
diff --git a/README.md b/README.md
index 54c1bbfd9..ac8a11dce 100644
--- a/README.md
+++ b/README.md
@@ -45,9 +45,9 @@ the `pip install` command, e.g.
 
 | Platform        | Install Command                                                                                                               |
 | --------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| Linux aarch64   | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_aarch64.whl` |
-| Linux x86-64    | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_x86_64.whl`  |
-| Windows x86-64  | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-win_amd64.whl`             |
+| Linux aarch64   | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_aarch64.whl` |
+| Linux x86-64    | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_x86_64.whl`  |
+| Windows x86-64  | `pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-win_amd64.whl`             |
 
 The `--force-reinstall` option may need to be used to overwrite a previous installation.
 
diff --git a/VERSION.md b/VERSION.md
index 88c5fb891..347f5833e 100644
--- a/VERSION.md
+++ b/VERSION.md
@@ -1 +1 @@
-1.4.0
+1.4.1
diff --git a/docs/installation.rst b/docs/installation.rst
index b432a3263..3e2e6354e 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -25,11 +25,11 @@ the ``pip install`` command, e.g.
    * - Platform
      - Install Command
    * - Linux aarch64
-     - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_aarch64.whl``
+     - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_aarch64.whl``
    * - Linux x86-64
-     - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-manylinux2014_x86_64.whl``
+     - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-manylinux2014_x86_64.whl``
    * - Windows x86-64
-     - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.0/warp_lang-1.4.0+cu11-py3-none-win_amd64.whl``
+     - ``pip install https://github.com/NVIDIA/warp/releases/download/v1.4.1/warp_lang-1.4.1+cu11-py3-none-win_amd64.whl``
 
 The ``--force-reinstall`` option may need to be used to overwrite a previous installation.
 
diff --git a/exts/omni.warp.core/config/extension.toml b/exts/omni.warp.core/config/extension.toml
index 841caf50a..04df653c2 100644
--- a/exts/omni.warp.core/config/extension.toml
+++ b/exts/omni.warp.core/config/extension.toml
@@ -1,6 +1,6 @@
 [package]
 # Semantic Versioning is used: https://semver.org/
-version = "1.4.0"
+version = "1.4.1"
 authors = ["NVIDIA"]
 title = "Warp Core"
 description="The core Warp Python module"
diff --git a/exts/omni.warp/config/extension.toml b/exts/omni.warp/config/extension.toml
index cfebd3b6e..46985a758 100644
--- a/exts/omni.warp/config/extension.toml
+++ b/exts/omni.warp/config/extension.toml
@@ -1,6 +1,6 @@
 [package]
 # Semantic Versioning is used: https://semver.org/
-version = "1.4.0"
+version = "1.4.1"
 authors = ["NVIDIA"]
 title = "Warp"
 description="Warp OmniGraph Nodes and Sample Scenes"
@@ -35,7 +35,7 @@ exclude = ["Ogn*Database.py", "*/ogn*"]
 "omni.timeline" = {}
 "omni.ui" = {optional = true}
 "omni.usd" = {}
-"omni.warp.core" = {version = "1.4.0", exact = true}
+"omni.warp.core" = {version = "1.4.1", exact = true}
 
 [[python.module]]
 name = "omni.warp._extension"
diff --git a/warp/config.py b/warp/config.py
index 49df51ea6..a703df21f 100644
--- a/warp/config.py
+++ b/warp/config.py
@@ -7,7 +7,7 @@
 
 from typing import Optional
 
-version: str = "1.4.0"
+version: str = "1.4.1"
 """Warp version string"""
 
 verify_fp: bool = False

From 46c9139fbeddef7cee6273551b925df0cb49dec2 Mon Sep 17 00:00:00 2001
From: Christopher Crouzet <christopher@crouzet.pm>
Date: Tue, 15 Oct 2024 07:41:45 +1300
Subject: [PATCH 17/18] Readd random number built-ins to Python's runtime

This reverts commit 25861dcc75b3f6e1d953b1be493cab5ee29e8ad5.
---
 CHANGELOG.md          |  1 +
 warp/builtins.py      | 24 +-----------------------
 warp/native/exports.h | 17 +++++++++++++++++
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e87e683b..f33378af0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Fix invalid code generation error messages when nesting dynamic and static for-loops.
 - Fix caching of kernels with static expressions.
 - Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation.
+- Re-introduced the `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` built-ins in the Python scope to revert a breaking change.
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/warp/builtins.py b/warp/builtins.py
index 4f0bf6551..5793d3378 100644
--- a/warp/builtins.py
+++ b/warp/builtins.py
@@ -2758,7 +2758,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "rand_init",
     input_types={"seed": int},
     value_type=uint32,
-    export=False,
     group="Random",
     doc="Initialize a new random number generator given a user-defined seed. Returns a 32-bit integer representing the RNG state.",
 )
@@ -2767,7 +2766,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "rand_init",
     input_types={"seed": int, "offset": int},
     value_type=uint32,
-    export=False,
     group="Random",
     doc="""Initialize a new random number generator given a user-defined seed and an offset.
 
@@ -2779,7 +2777,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "randi",
     input_types={"state": uint32},
     value_type=int,
-    export=False,
     group="Random",
     doc="Return a random integer in the range [0, 2^32).",
 )
@@ -2787,7 +2784,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "randi",
     input_types={"state": uint32, "low": int, "high": int},
     value_type=int,
-    export=False,
     group="Random",
     doc="Return a random integer between [low, high).",
 )
@@ -2795,7 +2791,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "randf",
     input_types={"state": uint32},
     value_type=float,
-    export=False,
     group="Random",
     doc="Return a random float between [0.0, 1.0).",
 )
@@ -2803,24 +2798,17 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "randf",
     input_types={"state": uint32, "low": float, "high": float},
     value_type=float,
-    export=False,
     group="Random",
     doc="Return a random float between [low, high).",
 )
 add_builtin(
-    "randn",
-    input_types={"state": uint32},
-    value_type=float,
-    export=False,
-    group="Random",
-    doc="Sample a normal distribution.",
+    "randn", input_types={"state": uint32}, value_type=float, group="Random", doc="Sample a normal distribution."
 )
 
 add_builtin(
     "sample_cdf",
     input_types={"state": uint32, "cdf": array(dtype=float)},
     value_type=int,
-    export=False,
     group="Random",
     doc="Inverse-transform sample a cumulative distribution function.",
 )
@@ -2828,7 +2816,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_triangle",
     input_types={"state": uint32},
     value_type=vec2,
-    export=False,
     group="Random",
     doc="Uniformly sample a triangle. Returns sample barycentric coordinates.",
 )
@@ -2836,7 +2823,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_ring",
     input_types={"state": uint32},
     value_type=vec2,
-    export=False,
     group="Random",
     doc="Uniformly sample a ring in the xy plane.",
 )
@@ -2844,7 +2830,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_disk",
     input_types={"state": uint32},
     value_type=vec2,
-    export=False,
     group="Random",
     doc="Uniformly sample a disk in the xy plane.",
 )
@@ -2852,7 +2837,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_sphere_surface",
     input_types={"state": uint32},
     value_type=vec3,
-    export=False,
     group="Random",
     doc="Uniformly sample a unit sphere surface.",
 )
@@ -2860,7 +2844,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_sphere",
     input_types={"state": uint32},
     value_type=vec3,
-    export=False,
     group="Random",
     doc="Uniformly sample a unit sphere.",
 )
@@ -2868,7 +2851,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_hemisphere_surface",
     input_types={"state": uint32},
     value_type=vec3,
-    export=False,
     group="Random",
     doc="Uniformly sample a unit hemisphere surface.",
 )
@@ -2876,7 +2858,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_hemisphere",
     input_types={"state": uint32},
     value_type=vec3,
-    export=False,
     group="Random",
     doc="Uniformly sample a unit hemisphere.",
 )
@@ -2884,7 +2865,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_square",
     input_types={"state": uint32},
     value_type=vec2,
-    export=False,
     group="Random",
     doc="Uniformly sample a unit square.",
 )
@@ -2892,7 +2872,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "sample_unit_cube",
     input_types={"state": uint32},
     value_type=vec3,
-    export=False,
     group="Random",
     doc="Uniformly sample a unit cube.",
 )
@@ -2901,7 +2880,6 @@ def volume_sample_grad_index_value_func(arg_types: Mapping[str, type], arg_value
     "poisson",
     input_types={"state": uint32, "lam": float},
     value_type=uint32,
-    export=False,
     group="Random",
     doc="""Generate a random sample from a Poisson distribution.
 
diff --git a/warp/native/exports.h b/warp/native/exports.h
index 177780565..f8fd82af4 100644
--- a/warp/native/exports.h
+++ b/warp/native/exports.h
@@ -1013,6 +1013,23 @@ WP_API void builtin_volume_index_to_world_uint64_vec3f(uint64 id, vec3f& uvw, ve
 WP_API void builtin_volume_world_to_index_uint64_vec3f(uint64 id, vec3f& xyz, vec3f* ret) { *ret = wp::volume_world_to_index(id, xyz); }
 WP_API void builtin_volume_index_to_world_dir_uint64_vec3f(uint64 id, vec3f& uvw, vec3f* ret) { *ret = wp::volume_index_to_world_dir(id, uvw); }
 WP_API void builtin_volume_world_to_index_dir_uint64_vec3f(uint64 id, vec3f& xyz, vec3f* ret) { *ret = wp::volume_world_to_index_dir(id, xyz); }
+WP_API void builtin_rand_init_int32(int32 seed, uint32* ret) { *ret = wp::rand_init(seed); }
+WP_API void builtin_rand_init_int32_int32(int32 seed, int32 offset, uint32* ret) { *ret = wp::rand_init(seed, offset); }
+WP_API void builtin_randi_uint32(uint32 state, int* ret) { *ret = wp::randi(state); }
+WP_API void builtin_randi_uint32_int32_int32(uint32 state, int32 low, int32 high, int* ret) { *ret = wp::randi(state, low, high); }
+WP_API void builtin_randf_uint32(uint32 state, float* ret) { *ret = wp::randf(state); }
+WP_API void builtin_randf_uint32_float32_float32(uint32 state, float32 low, float32 high, float* ret) { *ret = wp::randf(state, low, high); }
+WP_API void builtin_randn_uint32(uint32 state, float* ret) { *ret = wp::randn(state); }
+WP_API void builtin_sample_triangle_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_triangle(state); }
+WP_API void builtin_sample_unit_ring_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_unit_ring(state); }
+WP_API void builtin_sample_unit_disk_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_unit_disk(state); }
+WP_API void builtin_sample_unit_sphere_surface_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_sphere_surface(state); }
+WP_API void builtin_sample_unit_sphere_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_sphere(state); }
+WP_API void builtin_sample_unit_hemisphere_surface_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_hemisphere_surface(state); }
+WP_API void builtin_sample_unit_hemisphere_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_hemisphere(state); }
+WP_API void builtin_sample_unit_square_uint32(uint32 state, vec2f* ret) { *ret = wp::sample_unit_square(state); }
+WP_API void builtin_sample_unit_cube_uint32(uint32 state, vec3f* ret) { *ret = wp::sample_unit_cube(state); }
+WP_API void builtin_poisson_uint32_float32(uint32 state, float32 lam, uint32* ret) { *ret = wp::poisson(state, lam); }
 WP_API void builtin_noise_uint32_float32(uint32 state, float32 x, float* ret) { *ret = wp::noise(state, x); }
 WP_API void builtin_noise_uint32_vec2f(uint32 state, vec2f& xy, float* ret) { *ret = wp::noise(state, xy); }
 WP_API void builtin_noise_uint32_vec3f(uint32 state, vec3f& xyz, float* ret) { *ret = wp::noise(state, xyz); }

From b1309f1c82a76d5e67a8d23fd9e7260f36c00ffb Mon Sep 17 00:00:00 2001
From: Eric Shi <ershi@nvidia.com>
Date: Mon, 14 Oct 2024 14:26:58 -0700
Subject: [PATCH 18/18] Synchronize changelogs

---
 exts/omni.warp.core/docs/CHANGELOG.md | 1 +
 exts/omni.warp/docs/CHANGELOG.md      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md
index 195391235..87897c2b9 100644
--- a/exts/omni.warp.core/docs/CHANGELOG.md
+++ b/exts/omni.warp.core/docs/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Fix invalid code generation error messages when nesting dynamic and static for-loops.
 - Fix caching of kernels with static expressions.
 - Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation.
+- Re-introduced the `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` built-ins in the Python scope to revert a breaking change.
 
 ## [1.4.0] - 2024-10-01
 
diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md
index 195391235..87897c2b9 100644
--- a/exts/omni.warp/docs/CHANGELOG.md
+++ b/exts/omni.warp/docs/CHANGELOG.md
@@ -10,6 +10,7 @@
 - Fix invalid code generation error messages when nesting dynamic and static for-loops.
 - Fix caching of kernels with static expressions.
 - Fix `ModelBuilder.add_builder(builder)` to correctly update `articulation_start` and thereby `articulation_count` when `builder` contains more than one articulation.
+- Re-introduced the `wp.rand*()`, `wp.sample*()`, and `wp.poisson()` built-ins in the Python scope to revert a breaking change.
 
 ## [1.4.0] - 2024-10-01