diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/RegionUtils.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/RegionUtils.kt index 279f3f63fa..e43c3d35c9 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/RegionUtils.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/RegionUtils.kt @@ -27,6 +27,7 @@ package de.fraunhofer.aisec.cpg.helpers import de.fraunhofer.aisec.cpg.frontends.LanguageFrontend import de.fraunhofer.aisec.cpg.sarif.Region +import kotlin.math.min import org.apache.commons.lang3.StringUtils /** @@ -62,12 +63,17 @@ fun getCodeOfSubregion(code: String, nodeRegion: Region, subRegion: Region): Str (StringUtils.ordinalIndexOf(code, nlType, subRegion.startLine - nodeRegion.startLine) + subRegion.startColumn) } - val end = + var end = if (subRegion.endLine == nodeRegion.startLine) { subRegion.endColumn - nodeRegion.startColumn } else { (StringUtils.ordinalIndexOf(code, nlType, subRegion.endLine - nodeRegion.startLine) + subRegion.endColumn) } + + // The Python AST reports column offsets in UTF-8 bytes rather than characters, so for code + // containing non-ASCII characters the reported end position can lie beyond the end of our + // code string. Clamp it so that the substring call below stays within bounds. + end = min(end, code.length) return code.substring(start, end) } diff --git a/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt b/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt index f68284a5de..db35a4063d 100644 --- a/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt +++ b/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt @@ -1453,6 +1453,33 @@ class PythonFrontendTest : BaseTest() { assertEquals(4.toLong(), rhs.evaluate()) } + @Test + fun testParseWithUnicode() { + val topLevel = Path.of("src", "test", "resources", "python") + val tu = +
analyzeAndGetFirstTU(listOf(topLevel.resolve("unicode.py").toFile()), topLevel, true) { + it.registerLanguage() + } + assertNotNull(tu) + + val normalFunc = tu.functions["normal_func"] + assertNotNull(normalFunc) + // 11 chars (including whitespace) -> SARIF position = 12 + // e = "e" + assertEquals(12, normalFunc.body?.location?.region?.endColumn) + + val unicodeFunc = tu.functions["unicode_func"] + assertNotNull(unicodeFunc) + + // also 11 chars (including whitespace) -> expected SARIF position = 12 + // But Python's AST counts UTF-8 bytes ("é" is two bytes), so the position is 13 + // e = "é" + assertEquals(13, unicodeFunc.body?.location?.region?.endColumn) + + // So the reported end exceeds the line, but we clamp it and avoid a crash + assertEquals("e = \"é\"", unicodeFunc.body?.code) + } + class PythonValueEvaluator : ValueEvaluator() { override fun computeBinaryOpEffect( lhsValue: Any?, diff --git a/cpg-language-python/src/test/resources/python/unicode.py b/cpg-language-python/src/test/resources/python/unicode.py new file mode 100644 index 0000000000..73d35ff23a --- /dev/null +++ b/cpg-language-python/src/test/resources/python/unicode.py @@ -0,0 +1,5 @@ +def normal_func(): + e = "e" + +def unicode_func(): + e = "é"