Skip to content

Commit

Permalink
strings2arrays: make it work again for sequences of unequal length
Browse files Browse the repository at this point in the history
PR explosion#897 fixed the dtypes in strings2arrays, but it also broke
strings2arrays for batches with sequences of unequal lengths.
  • Loading branch information
danieldk committed Jan 9, 2024
1 parent e570a1a commit 5f5367a
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions thinc/layers/strings2arrays.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Callable, List, Sequence, Tuple
from ctypes import c_uint64

from murmurhash import hash_unicode

Expand All @@ -17,8 +18,10 @@ def strings2arrays() -> Model[InT, OutT]:


def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]:
hashes = [[hash_unicode(word) for word in X] for X in Xs]
hash_arrays = [model.ops.asarray2i(h, dtype="uint64") for h in hashes]
# Cast 32-bit (signed) integer to 64-bit unsigned, since such casting
# is deprecated in NumPy.
hashes = [[c_uint64(hash_unicode(word)).value for word in X] for X in Xs]
hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes]
arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays]

def backprop(dX: OutT) -> InT:
Expand Down

0 comments on commit 5f5367a

Please sign in to comment.