Skip to content

Commit

Permalink
format: use character semantics for length and index operations on strings (#2921)
Browse files Browse the repository at this point in the history

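Conceptually, strings (as in Python) are sequences of characters (Unicode
code points), not bytes. Use character-based semantics when computing the
length of a string or looking up what is in a string at a particular offset.
Note that a combining mark is a code point in its own right, so it counts
as one character.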

This more closely matches Python's behaviour for these operations:

```
Python 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> print(len("line"))
4
>>> print(len("línea"))
6
>>> print(len("линия"))
5
>>> print(len("خط"))
2
```

Fixes #2920.
  • Loading branch information
chrisnovakovic authored Oct 12, 2023
1 parent c97dc5b commit 1643b23
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/parse/asp/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ func objLen(obj pyObject) pyInt {
case pyFrozenDict:
return pyInt(len(t.pyDict))
case pyString:
return pyInt(len(t))
return pyInt(len([]rune(t)))
}
panic("object of type " + obj.Type() + " has no len()")
}
Expand Down
15 changes: 15 additions & 0 deletions src/parse/asp/interpreter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,21 @@ func TestInterpreterLen(t *testing.T) {
s, err := parseFile("src/parse/asp/test_data/interpreter/len.build")
assert.NoError(t, err)
assert.EqualValues(t, "sync", s.Lookup("y"))
assert.EqualValues(t, 4, s.Lookup("l1"))
assert.EqualValues(t, 6, s.Lookup("l2"))
assert.EqualValues(t, 5, s.Lookup("l3"))
assert.EqualValues(t, 2, s.Lookup("l4"))
}

// TestInterpreterIndex parses the index_string.build fixture and checks the
// value bound to each cN variable, i.e. the result of indexing the
// corresponding sN string at a fixed offset.
func TestInterpreterIndex(t *testing.T) {
	t.Run("String indexing", func(t *testing.T) {
		const fixture = "src/parse/asp/test_data/interpreter/index_string.build"

		scope, err := parseFile(fixture)
		assert.NoError(t, err)

		// Each entry pairs a variable assigned in the fixture with the
		// single-character string it is expected to hold.
		cases := []struct {
			variable string
			want     pyString
		}{
			{variable: "c1", want: pyString("l")},
			{variable: "c2", want: pyString("n")},
			{variable: "c3", want: pyString("\u043d")},
			{variable: "c4", want: pyString("\u0637")},
		}
		for _, tc := range cases {
			assert.EqualValues(t, tc.want, scope.Lookup(tc.variable))
		}
	})
}

func TestInterpreterFStringDollars(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion src/parse/asp/objects.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ func (s pyString) Operator(operator Operator, operand pyObject) pyObject {
case NotIn:
return newPyBool(!strings.Contains(string(s), string(s2)))
case Index:
return pyString(s[pyIndex(s, operand, false)])
return pyString([]rune(s)[pyIndex(s, operand, false)])
}
panic("Unknown operator for string")
}
Expand Down
28 changes: 28 additions & 0 deletions src/parse/asp/test_data/interpreter/index_string.build
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# U+006C : LATIN SMALL LETTER L
# U+0069 : LATIN SMALL LETTER I
# U+006E : LATIN SMALL LETTER N
# U+0065 : LATIN SMALL LETTER E
s1 = "line"
c1 = s1[0]

# U+006C : LATIN SMALL LETTER L
# U+0069 : LATIN SMALL LETTER I
# U+0301 : COMBINING ACUTE ACCENT {stress mark; Greek oxia, tonos}
#          (a separate code point after the "i", so "n" sits at index 3)
# U+006E : LATIN SMALL LETTER N
# U+0065 : LATIN SMALL LETTER E
# U+0061 : LATIN SMALL LETTER A
s2 = "línea"
c2 = s2[3]

# U+043B : CYRILLIC SMALL LETTER EL
# U+0438 : CYRILLIC SMALL LETTER I
# U+043D : CYRILLIC SMALL LETTER EN
# U+0438 : CYRILLIC SMALL LETTER I
# U+044F : CYRILLIC SMALL LETTER YA
s3 = "линия"
c3 = s3[2]

# U+062E : ARABIC LETTER KHAH
# U+0637 : ARABIC LETTER TAH
s4 = "خط"
c4 = s4[1]
29 changes: 29 additions & 0 deletions src/parse/asp/test_data/interpreter/len.build
Original file line number Diff line number Diff line change
@@ -1,2 +1,31 @@
x = 'golang.org/x/sync'
y = x[len('golang.org/x/'):]

# U+006C : LATIN SMALL LETTER L
# U+0069 : LATIN SMALL LETTER I
# U+006E : LATIN SMALL LETTER N
# U+0065 : LATIN SMALL LETTER E
s1 = "line"
l1 = len(s1)

# U+006C : LATIN SMALL LETTER L
# U+0069 : LATIN SMALL LETTER I
# U+0301 : COMBINING ACUTE ACCENT {stress mark; Greek oxia, tonos}
#          (a separate code point after the "i", so len(s2) is 6, not 5)
# U+006E : LATIN SMALL LETTER N
# U+0065 : LATIN SMALL LETTER E
# U+0061 : LATIN SMALL LETTER A
s2 = "línea"
l2 = len(s2)

# U+043B : CYRILLIC SMALL LETTER EL
# U+0438 : CYRILLIC SMALL LETTER I
# U+043D : CYRILLIC SMALL LETTER EN
# U+0438 : CYRILLIC SMALL LETTER I
# U+044F : CYRILLIC SMALL LETTER YA
s3 = "линия"
l3 = len(s3)

# U+062E : ARABIC LETTER KHAH
# U+0637 : ARABIC LETTER TAH
s4 = "خط"
l4 = len(s4)

0 comments on commit 1643b23

Please sign in to comment.