Skip to content

Commit

Permalink
format: use character semantics for length and index operations on strings
Browse files Browse the repository at this point in the history

Conceptually, strings (as in Python) are sequences of characters (Unicode
code points), not bytes. Use character-based semantics when computing the
length of a string or looking up what is in a string at a particular offset.

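This more closely matches Python's behaviour for these operations (the second
example string below is written with "i" followed by U+0301 COMBINING ACUTE
ACCENT, which is why its length is 6 rather than 5):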

```
Python 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> print(len("line"))
4
>>> print(len("línea"))
6
>>> print(len("линия"))
5
>>> print(len("خط"))
2
```

Fixes #2920.
  • Loading branch information
chrisnovakovic committed Oct 11, 2023
1 parent c97dc5b commit 5a57d14
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/parse/asp/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ func objLen(obj pyObject) pyInt {
case pyFrozenDict:
return pyInt(len(t.pyDict))
case pyString:
return pyInt(len(t))
return pyInt(len([]rune(t)))
}
panic("object of type " + obj.Type() + " has no len()")
}
Expand Down
8 changes: 8 additions & 0 deletions src/parse/asp/interpreter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,14 @@ func TestInterpreterLen(t *testing.T) {
s, err := parseFile("src/parse/asp/test_data/interpreter/len.build")
assert.NoError(t, err)
assert.EqualValues(t, "sync", s.Lookup("y"))
assert.EqualValues(t, 4, s.Lookup("l1"))
assert.EqualValues(t, "l", s.Lookup("c1"))
assert.EqualValues(t, 6, s.Lookup("l2"))
assert.EqualValues(t, "n", s.Lookup("c2"))
assert.EqualValues(t, 5, s.Lookup("l3"))
assert.EqualValues(t, "н", s.Lookup("c3"))
assert.EqualValues(t, 2, s.Lookup("l4"))
assert.EqualValues(t, "ط", s.Lookup("c4"))
}

func TestInterpreterFStringDollars(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion src/parse/asp/objects.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ func (s pyString) Operator(operator Operator, operand pyObject) pyObject {
case NotIn:
return newPyBool(!strings.Contains(string(s), string(s2)))
case Index:
return pyString(s[pyIndex(s, operand, false)])
return pyString([]rune(s)[pyIndex(s, operand, false)])
}
panic("Unknown operator for string")
}
Expand Down
33 changes: 33 additions & 0 deletions src/parse/asp/test_data/interpreter/len.build
Original file line number Diff line number Diff line change
@@ -1,2 +1,35 @@
# Slicing from len(prefix) onwards strips the 'golang.org/x/' prefix;
# the interpreter test expects y to be 'sync'.
x = 'golang.org/x/sync'
y = x[len('golang.org/x/'):]

# U+006C : LATIN SMALL LETTER L
# U+0069 : LATIN SMALL LETTER I
# U+006E : LATIN SMALL LETTER N
# U+0065 : LATIN SMALL LETTER E
s1 = "line"
l1 = len(s1)
c1 = s1[0]

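# NOTE: the accented letter in this string is deliberately decomposed
# ("i" followed by a combining acute accent), so the string has 6 code
# points rather than 5 and index 3 is "n".
# U+006C : LATIN SMALL LETTER L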
# U+0069 : LATIN SMALL LETTER I
# U+0301 : COMBINING ACUTE ACCENT {stress mark; Greek oxia, tonos}
# U+006E : LATIN SMALL LETTER N
# U+0065 : LATIN SMALL LETTER E
# U+0061 : LATIN SMALL LETTER A
s2 = "línea"
l2 = len(s2)
c2 = s2[3]

# U+043B : CYRILLIC SMALL LETTER EL
# U+0438 : CYRILLIC SMALL LETTER I
# U+043D : CYRILLIC SMALL LETTER EN
# U+0438 : CYRILLIC SMALL LETTER I
# U+044F : CYRILLIC SMALL LETTER YA
s3 = "линия"
l3 = len(s3)
c3 = s3[2]

# U+062E : ARABIC LETTER KHAH
# U+0637 : ARABIC LETTER TAH
s4 = "خط"
l4 = len(s4)
c4 = s4[1]

0 comments on commit 5a57d14

Please sign in to comment.