From baac1fd61b7c46f20f92bed9498b0f39f38bbfa9 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Fri, 12 Nov 2021 16:10:16 +0100 Subject: [PATCH] Fix another string sort bug --- src/compiler/passes/tokenize.fut | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/compiler/passes/tokenize.fut b/src/compiler/passes/tokenize.fut index cf37c80..fe3c665 100644 --- a/src/compiler/passes/tokenize.fut +++ b/src/compiler/passes/tokenize.fut @@ -45,19 +45,21 @@ local let parse_float (input: []u8) ((_, offset, len): tokenref): f32 = -- IDs are assigned sequentially starting from 0. local let link_names [n] (input: []u8) (tokens: [n]tokenref): [n]u32 = let (_, offsets, lengths) = unzip3 tokens - -- a-zA-Z0-9_ are 26 + 26 + 10 + 1 = 63 characters, so 6 bits will do. + -- a-zA-Z0-9_ are 26 + 26 + 10 + 1 = 63 characters, plus one for the out-of-bounds value, so 6 bits will do nicely. let bits_per_char = 6 -- Map characters allowed in a function name to its 6-bit representation. + -- Note: This function maps a char to 1-63 instead of 0-62, as the out-of-bounds value (0) needs to + -- compare differently from any of the other characters in order to obtain a good sorting. let char_to_value (c: u8): u8 = - if c >= 'a' && c <= 'z' then c - 'a' - else if c >= 'A' && c <= 'Z' then c - 'A' + ('z' - 'a' + 1) - else if c >= '0' && c <= '0' then c - '0' + ('z' - 'a' + 1) + ('Z' - 'A' + 1) - else c - '_' + ('z' - 'a' + 1) + ('Z' - 'A' + 1) + ('9' - '0' + 1) + if c >= 'a' && c <= 'z' then c - 'a' + 1 + else if c >= 'A' && c <= 'Z' then c - 'A' + 27 + else if c >= '0' && c <= '9' then c - '0' + 53 + else 63 -- '_' -- Get a particular bit in the string at `index`. let get_name_bit (bit: i32) (index: i32): i32 = let bit_in_char = bit % bits_per_char let byte_in_string = bit / bits_per_char - in if byte_in_string >= lengths[index] then 0 else + in if byte_in_string >= lengths[index] then 0 else -- Return 0 if out of bounds. 
let c = input[offsets[index] + byte_in_string] in u8.get_bit bit_in_char (char_to_value c) -- To finally assign an ID to ever string, we need to know whether it is equal to another string. @@ -102,7 +104,7 @@ local let link_names [n] (input: []u8) (tokens: [n]tokenref): [n]u32 = let tokenize (input: []u8) (lt: lex_table []) = lexer.lex input lt -- Filter out tokens whitespace tokens (which should be ignored by the parser). - |> filter (\(t, _, _) -> t != token_whitespace && t != token_comment && t != token_binary_minus_whitespace) + |> filter (\(t, _, _) -> t != token_whitespace && t != token_comment && t != token_binary_minus_whitespace) -- | This function builds a data vector for the token types, containing the following elements: -- - For each atom_name, a unique 32-bit integer for the name associated to the atom.