From d34f536ea6cb2df6ad2f72e1e5b7511aafe3c66d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 9 Jan 2024 10:24:20 +0100 Subject: [PATCH] strings2arrays: make work again for sequences of unequal length PR #897 fixed the dtypes in strings2arrays, but also broke strings2arrays for batches with sequences of unequal lengths. --- thinc/layers/strings2arrays.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index 91a6b1a31..eba2c983d 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -1,3 +1,4 @@ +from ctypes import c_uint64 from typing import Callable, List, Sequence, Tuple from murmurhash import hash_unicode @@ -17,8 +18,10 @@ def strings2arrays() -> Model[InT, OutT]: def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]: - hashes = [[hash_unicode(word) for word in X] for X in Xs] - hash_arrays = [model.ops.asarray2i(h, dtype="uint64") for h in hashes] + # Cast 32-bit (signed) integer to 64-bit unsigned, since such casting + # is deprecated in NumPy. + hashes = [[c_uint64(hash_unicode(word)).value for word in X] for X in Xs] + hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes] arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays] def backprop(dX: OutT) -> InT: