Fixed crash in getCodeOfSubregion (#1776)

Fraunhofer-AISEC · Oct 8, 2024 · 9755f71 · 9755f71
1 parent 0187754
commit 9755f71
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 1 deletion.
diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/RegionUtils.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/RegionUtils.kt
@@ -27,6 +27,7 @@ package de.fraunhofer.aisec.cpg.helpers
 
 import de.fraunhofer.aisec.cpg.frontends.LanguageFrontend
 import de.fraunhofer.aisec.cpg.sarif.Region
+import kotlin.math.min
 import org.apache.commons.lang3.StringUtils
 
 /**
@@ -62,12 +63,17 @@ fun getCodeOfSubregion(code: String, nodeRegion: Region, subRegion: Region): Str
             (StringUtils.ordinalIndexOf(code, nlType, subRegion.startLine - nodeRegion.startLine) +
                 subRegion.startColumn)
         }
-    val end =
+    var end =
         if (subRegion.endLine == nodeRegion.startLine) {
             subRegion.endColumn - nodeRegion.startColumn
         } else {
             (StringUtils.ordinalIndexOf(code, nlType, subRegion.endLine - nodeRegion.startLine) +
                 subRegion.endColumn)
         }
+
+    // Python's ast module reports column offsets as UTF-8 byte offsets, so a line containing
+    // non-ASCII characters can yield an end position past the end of our (char-indexed) "code".
+    // Clamp "end" to the string length so that substring does not throw.
+    end = min(end, code.length)
     return code.substring(start, end)
 }
diff --git a/...age-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt b/...age-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt
@@ -1453,6 +1453,33 @@ class PythonFrontendTest : BaseTest() {
         assertEquals(4.toLong(), rhs.evaluate())
     }
 
+    @Test
+    fun testParseWithUnicode() {
+        val topLevel = Path.of("src", "test", "resources", "python")
+        val tu =
+            analyzeAndGetFirstTU(listOf(topLevel.resolve("unicode.py").toFile()), topLevel, true) {
+                it.registerLanguage<PythonLanguage>()
+            }
+        assertNotNull(tu)
+
+        val normalFunc = tu.functions["normal_func"]
+        assertNotNull(normalFunc)
+        // ASCII baseline: 11 chars (including whitespace) -> SARIF end column = 12
+        //     e = "e"
+        assertEquals(12, normalFunc.body?.location?.region?.endColumn)
+
+        val unicodeFunc = tu.functions["unicode_func"]
+        assertNotNull(unicodeFunc)
+
+        // Also 11 chars, so a char-based endColumn would be 12. But "é" takes two bytes in UTF-8
+        // and the Python parser reports byte-based columns, so the reported endColumn is 13.
+        //     e = "é"
+        assertEquals(13, unicodeFunc.body?.location?.region?.endColumn)
+
+        // The region thus exceeds the line; the end is clamped to the code length, so no crash
+        assertEquals("e = \"é\"", unicodeFunc.body?.code)
+    }
+
     class PythonValueEvaluator : ValueEvaluator() {
         override fun computeBinaryOpEffect(
             lhsValue: Any?,

diff --git a/cpg-language-python/src/test/resources/python/unicode.py b/cpg-language-python/src/test/resources/python/unicode.py
@@ -0,0 +1,5 @@
+def normal_func():
+    e = "e"
+
+def unicode_func():
+    e = "é"