From 5b5814e45f7d22dbd7f73d11a1de50833aa68551 Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Fri, 20 Sep 2024 20:22:49 +0200 Subject: [PATCH 1/3] Changing function signature of `parse` to accept the file content instead of a file This PR changes the way `parse` works (in a backwards compatible way). Instead of parsing a `File`, we parse the file contents (and a path). The reasoning behind this is that almost all language frontends currently need to read the file contents and we can harmonize this. This will also allow us to provide more common statistics about the parsing context in the future. --- .../aisec/cpg/frontends/SupportsNewParse.kt | 38 +++++++++++++++ .../aisec/cpg/sarif/PhysicalLocation.kt | 18 +++---- .../python/PythonLanguageFrontend.kt | 48 +++++++++++++------ .../frontends/python/PythonFrontendTest.kt | 30 ++++++++++++ 4 files changed, 110 insertions(+), 24 deletions(-) create mode 100644 cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/SupportsNewParse.kt diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/SupportsNewParse.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/SupportsNewParse.kt new file mode 100644 index 0000000000..7bb3538062 --- /dev/null +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/SupportsNewParse.kt @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, Fraunhofer AISEC. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $$$$$$\ $$$$$$$\ $$$$$$\ + * $$ __$$\ $$ __$$\ $$ __$$\ + * $$ / \__|$$ | $$ |$$ / \__| + * $$ | $$$$$$$ |$$ |$$$$\ + * $$ | $$ ____/ $$ |\_$$ | + * $$ | $$\ $$ | $$ | $$ | + * \$$$$$ |$$ | \$$$$$ | + * \______/ \__| \______/ + * + */ +package de.fraunhofer.aisec.cpg.frontends + +import de.fraunhofer.aisec.cpg.graph.declarations.TranslationUnitDeclaration +import java.nio.file.Path + +interface SupportsNewParse { + /** + * Parses the given [content] with the language frontend into a [TranslationUnitDeclaration]. If + * known, a [path] should be specified, so that the language frontend can potentially use more + * advanced features like module resolution. + */ + fun parse(content: String, path: Path? = null): TranslationUnitDeclaration +} diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/sarif/PhysicalLocation.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/sarif/PhysicalLocation.kt index 3fca276c8f..3f0dd83f07 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/sarif/PhysicalLocation.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/sarif/PhysicalLocation.kt @@ -29,11 +29,15 @@ import java.net.URI import java.util.* /** A SARIF compatible location referring to a location, i.e. file and region within the file. */ -class PhysicalLocation(uri: URI, region: Region) { - class ArtifactLocation(val uri: URI) { +class PhysicalLocation(uri: URI?, region: Region) { + class ArtifactLocation(val uri: URI?) 
{ override fun toString(): String { - return uri.path.substring(uri.path.lastIndexOf('/') + 1) + return if (uri != null) { + uri.path + } else { + "unknown" + } } override fun equals(other: Any?): Boolean { @@ -45,7 +49,7 @@ class PhysicalLocation(uri: URI, region: Region) { override fun hashCode() = Objects.hashCode(uri) } - val artifactLocation: ArtifactLocation + var artifactLocation: ArtifactLocation var region: Region init { @@ -68,11 +72,7 @@ class PhysicalLocation(uri: URI, region: Region) { companion object { fun locationLink(location: PhysicalLocation?): String { return if (location != null) { - (location.artifactLocation.uri.path + - ":" + - location.region.startLine + - ":" + - location.region.startColumn) + "${location.artifactLocation}:${location.region.startLine}:${location.region.startColumn}" } else "unknown" } } diff --git a/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt b/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt index 029981cbe1..7c026ff997 100644 --- a/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt +++ b/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt @@ -28,6 +28,7 @@ package de.fraunhofer.aisec.cpg.frontends.python import de.fraunhofer.aisec.cpg.TranslationContext import de.fraunhofer.aisec.cpg.frontends.Language import de.fraunhofer.aisec.cpg.frontends.LanguageFrontend +import de.fraunhofer.aisec.cpg.frontends.SupportsNewParse import de.fraunhofer.aisec.cpg.frontends.TranslationException import de.fraunhofer.aisec.cpg.graph.* import de.fraunhofer.aisec.cpg.graph.declarations.TranslationUnitDeclaration @@ -39,15 +40,16 @@ import de.fraunhofer.aisec.cpg.passes.configuration.RegisterExtraPass import de.fraunhofer.aisec.cpg.sarif.PhysicalLocation import de.fraunhofer.aisec.cpg.sarif.Region import java.io.File -import 
java.net.URI +import java.nio.file.Path import jep.python.PyObject -import kotlin.io.path.Path +import kotlin.io.path.absolute +import kotlin.io.path.name import kotlin.io.path.nameWithoutExtension import kotlin.math.min @RegisterExtraPass(PythonAddDeclarationsPass::class) class PythonLanguageFrontend(language: Language, ctx: TranslationContext) : - LanguageFrontend(language, ctx) { + LanguageFrontend(language, ctx), SupportsNewParse { private val lineSeparator = '\n' // TODO private val tokenTypeIndex = 0 private val jep = JepSingleton // configure Jep @@ -62,21 +64,27 @@ class PythonLanguageFrontend(language: Language, ctx: Tr * new [PythonLanguageFrontend] instance per file. */ private lateinit var fileContent: String - private lateinit var uri: URI + private var filePath: Path? = null - @Throws(TranslationException::class) - override fun parse(file: File): TranslationUnitDeclaration { - fileContent = file.readText(Charsets.UTF_8) - uri = file.toURI() + override fun parse(content: String, path: Path?): TranslationUnitDeclaration { + this.fileContent = content + this.filePath = path jep.getInterp().use { - it.set("content", fileContent) - it.set("filename", file.absolutePath) + it.set("content", content) + it.set( + "filename", + if (path != null) { + path.absolute().toString() + } else { + "" + } + ) it.exec("import ast") it.exec("parsed = ast.parse(content, filename=filename, type_comments=True)") val pyAST = it.getValue("parsed") as PyObject - val tud = pythonASTtoCPG(pyAST, file.name) + val tud = pythonASTtoCPG(pyAST, path) if (config.matchCommentsToNodes) { it.exec("import tokenize") @@ -97,6 +105,11 @@ class PythonLanguageFrontend(language: Language, ctx: Tr } } + @Throws(TranslationException::class) + override fun parse(file: File): TranslationUnitDeclaration { + return parse(file.readText(Charsets.UTF_8), file.toPath()) + } + private fun addCommentsToCPG( tud: TranslationUnitDeclaration, pyTokens: ArrayList<*>, @@ -236,7 +249,7 @@ class 
PythonLanguageFrontend(language: Language, ctx: Tr override fun locationOf(astNode: Python.AST.AST): PhysicalLocation? { return if (astNode is Python.AST.WithLocation) { PhysicalLocation( - uri, + filePath?.toUri(), Region( startLine = astNode.lineno, endLine = astNode.end_lineno, @@ -253,17 +266,22 @@ class PythonLanguageFrontend(language: Language, ctx: Tr // will be invoked by native function } - private fun pythonASTtoCPG(pyAST: PyObject, path: String): TranslationUnitDeclaration { + private fun pythonASTtoCPG(pyAST: PyObject, path: Path?): TranslationUnitDeclaration { val pythonASTModule = fromPython(pyAST) as? Python.AST.Module ?: TODO( "Python ast of type ${fromPython(pyAST).javaClass} is not supported yet" ) // could be one of "ast.{Module,Interactive,Expression,FunctionType} - val tud = newTranslationUnitDeclaration(path, rawNode = pythonASTModule) + val tud = newTranslationUnitDeclaration(path?.name, rawNode = pythonASTModule) scopeManager.resetToGlobal(tud) - val nsdName = Path(path).nameWithoutExtension + val nsdName = + if (path != null) { + path.nameWithoutExtension + } else { + "unknown" + } val nsd = newNamespaceDeclaration(nsdName, rawNode = pythonASTModule) tud.addDeclaration(nsd) diff --git a/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt b/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt index 5d63e06d6d..c1a427c7cf 100644 --- a/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt +++ b/cpg-language-python/src/test/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonFrontendTest.kt @@ -25,6 +25,10 @@ */ package de.fraunhofer.aisec.cpg.frontends.python +import de.fraunhofer.aisec.cpg.ScopeManager +import de.fraunhofer.aisec.cpg.TranslationConfiguration +import de.fraunhofer.aisec.cpg.TranslationContext +import de.fraunhofer.aisec.cpg.TypeManager import de.fraunhofer.aisec.cpg.analysis.ValueEvaluator 
import de.fraunhofer.aisec.cpg.graph.* import de.fraunhofer.aisec.cpg.graph.Annotation @@ -1345,6 +1349,32 @@ class PythonFrontendTest : BaseTest() { assertEquals(4.toLong(), rhs.evaluate()) } + @Test + fun testParseContent() { + var frontend = + PythonLanguageFrontend( + language = PythonLanguage(), + ctx = + TranslationContext( + TranslationConfiguration.builder().build(), + ScopeManager(), + TypeManager() + ) + ) + + val tu = frontend.parse("a = 4\nprint(a)") + assertNotNull(tu) + + val unknown = tu.namespaces["unknown"] + assertNotNull(unknown) + + val refNames = tu.refs.map { it.name.localName } + assertEquals(listOf("a", "a", "print"), refNames) + + val call = tu.calls["print"] + assertNotNull(call) + } + class PythonValueEvaluator : ValueEvaluator() { override fun computeBinaryOpEffect( lhsValue: Any?, From eee60c28575a73a6e924d075fcb95650a06e2b2b Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Fri, 20 Sep 2024 22:18:32 +0200 Subject: [PATCH 2/3] Display time / LoC at the end --- .../aisec/cpg/TranslationManager.kt | 52 +++++++++++++++++-- .../fraunhofer/aisec/cpg/TranslationResult.kt | 2 + .../fraunhofer/aisec/cpg/TranslationStats.kt | 39 ++++++++++++++ .../aisec/cpg/helpers/MeasurementHolder.kt | 8 ++- 4 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationStats.kt diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationManager.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationManager.kt index 199f03fe91..98cee4d6e2 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationManager.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationManager.kt @@ -27,6 +27,7 @@ package de.fraunhofer.aisec.cpg import de.fraunhofer.aisec.cpg.frontends.Language import de.fraunhofer.aisec.cpg.frontends.LanguageFrontend +import de.fraunhofer.aisec.cpg.frontends.SupportsNewParse import 
de.fraunhofer.aisec.cpg.frontends.SupportsParallelParsing import de.fraunhofer.aisec.cpg.frontends.TranslationException import de.fraunhofer.aisec.cpg.graph.Component @@ -43,7 +44,10 @@ import java.util.concurrent.CompletableFuture import java.util.concurrent.CompletionException import java.util.concurrent.ExecutionException import java.util.concurrent.atomic.AtomicBoolean +import kotlin.io.path.absolute +import kotlin.io.path.readText import kotlin.reflect.full.findAnnotation +import kotlin.time.DurationUnit import org.slf4j.LoggerFactory /** Main entry point for all source code translation for all language front-ends. */ @@ -116,6 +120,15 @@ private constructor( } } + log.info( + "Translated {} LoC in total ({} / LoC)", + result.stats.totalLinesOfCode, + (outerBench.duration / result.stats.totalLinesOfCode).toString( + DurationUnit.MILLISECONDS, + decimals = 3 + ) + ) + return result } @@ -276,7 +289,7 @@ private constructor( val future = CompletableFuture.supplyAsync { try { - return@supplyAsync parse(component, ctx, sourceLocation) + return@supplyAsync parse(component, result, ctx, sourceLocation) } catch (e: TranslationException) { throw RuntimeException("Error parsing $sourceLocation", e) } @@ -337,7 +350,7 @@ private constructor( for (sourceLocation in sourceLocations) { ctx.currentComponent = component - val f = parse(component, ctx, sourceLocation) + val f = parse(component, result, ctx, sourceLocation) if (f != null) { handleCompletion(result, usedFrontends, sourceLocation, f) } @@ -365,6 +378,7 @@ private constructor( @Throws(TranslationException::class) private fun parse( component: Component, + result: TranslationResult, ctx: TranslationContext, sourceLocation: File, ): LanguageFrontend<*, *>? 
{ @@ -384,7 +398,30 @@ private constructor( } return null } - component.addTranslationUnit(frontend.parse(sourceLocation)) + + // Check whether the frontend supports the new API + var tu = + if (frontend is SupportsNewParse) { + // Read the file contents and supply it to the frontend. This gives us a chance + // to do some statistics here, for example on the lines of code. For now, we + // just print it; in a future PR we will gather this information and consolidate + // it. + var path = sourceLocation.toPath().absolute() + var content = path.readText() + var linesOfCode = content.linesOfCode + + log.info("{} has {} LoC", path, linesOfCode) + + var tu = frontend.parse(content, path) + + // Add the LoC. This needs to be synchronized on the stats object, because of + // parallel parsing + synchronized(result.stats) { result.stats.totalLinesOfCode += linesOfCode } + tu + } else { + frontend.parse(sourceLocation) + } + component.addTranslationUnit(tu) } catch (ex: TranslationException) { log.error("An error occurred during parsing of ${sourceLocation.name}: ${ex.message}") if (config.failOnError) { @@ -462,3 +499,12 @@ private constructor( } } } + +/** + * This returns a VERY trivial count of the lines of code (the number of newline characters). This + * can be extended to a real LoC algorithm at some point. + */ +val String.linesOfCode: Int + get() { + return this.count { it == '\n' } + } diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationResult.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationResult.kt index 05ace1a4dc..675122ca57 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationResult.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationResult.kt @@ -85,6 +85,8 @@ class TranslationResult( return finalCtx } + var stats = TranslationStats() + /** * Checks if only a single software component has been analyzed and returns its translation * units. 
For multiple software components, it aggregates the results. diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationStats.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationStats.kt new file mode 100644 index 0000000000..a97cf11be7 --- /dev/null +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/TranslationStats.kt @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2024, Fraunhofer AISEC. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * $$$$$$\ $$$$$$$\ $$$$$$\ + * $$ __$$\ $$ __$$\ $$ __$$\ + * $$ / \__|$$ | $$ |$$ / \__| + * $$ | $$$$$$$ |$$ |$$$$\ + * $$ | $$ ____/ $$ |\_$$ | + * $$ | $$\ $$ | $$ | $$ | + * \$$$$$ |$$ | \$$$$$ | + * \______/ \__| \______/ + * + */ +package de.fraunhofer.aisec.cpg + +import de.fraunhofer.aisec.cpg.helpers.MeasurementHolder +import de.fraunhofer.aisec.cpg.helpers.StatisticsHolder + +/** + * This class provides some statistics about our translation process. At some point this will fully + * replace [StatisticsHolder] and [MeasurementHolder] + */ +class TranslationStats { + + /** The total lines of code that were translated into the CPG. 
*/ + var totalLinesOfCode: Int = 0 +} diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/MeasurementHolder.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/MeasurementHolder.kt index f16b284cdc..5e5d76091c 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/MeasurementHolder.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/helpers/MeasurementHolder.kt @@ -33,6 +33,8 @@ import java.nio.file.Path import java.time.Duration import java.time.Instant import java.util.* +import kotlin.time.DurationUnit +import kotlin.time.toDuration import org.slf4j.Logger import org.slf4j.LoggerFactory @@ -146,6 +148,7 @@ constructor( ) : MeasurementHolder(c, message, debug, holder) { private val start: Instant + var duration: kotlin.time.Duration = kotlin.time.Duration.ZERO /** Stops this benchmark and adds its measurement to the its [StatisticsHolder]. */ fun stop() { @@ -154,7 +157,7 @@ constructor( /** Stops the time and computes the difference between */ override fun addMeasurement(measurementKey: String?, measurementValue: String?): Any? 
{ - val duration = Duration.between(start, Instant.now()).toMillis() + var duration = Duration.between(start, Instant.now()).toMillis() measurements["${caller}: $message"] = "$duration ms" logDebugMsg("$caller: $message done in $duration ms") @@ -162,6 +165,9 @@ constructor( // update our holder, if we have any holder?.addBenchmark(this) + // update our internal duration so that others can access it + this.duration = duration.toDuration(DurationUnit.MILLISECONDS) + return duration } From 416ffed3de28b6874b9375b27161129aaa469058 Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Sat, 21 Sep 2024 09:42:45 +0200 Subject: [PATCH 3/3] Supporting ruby --- .../frontends/python/PythonLanguageFrontend.kt | 10 +++++----- .../cpg/frontends/ruby/RubyLanguageFrontend.kt | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt b/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt index 7c026ff997..35473d229d 100644 --- a/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt +++ b/cpg-language-python/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/python/PythonLanguageFrontend.kt @@ -66,6 +66,11 @@ class PythonLanguageFrontend(language: Language, ctx: Tr private lateinit var fileContent: String private var filePath: Path? 
= null + @Throws(TranslationException::class) + override fun parse(file: File): TranslationUnitDeclaration { + return parse(file.readText(Charsets.UTF_8), file.toPath()) + } + override fun parse(content: String, path: Path?): TranslationUnitDeclaration { this.fileContent = content this.filePath = path @@ -105,11 +110,6 @@ class PythonLanguageFrontend(language: Language, ctx: Tr } } - @Throws(TranslationException::class) - override fun parse(file: File): TranslationUnitDeclaration { - return parse(file.readText(Charsets.UTF_8), file.toPath()) - } - private fun addCommentsToCPG( tud: TranslationUnitDeclaration, pyTokens: ArrayList<*>, diff --git a/cpg-language-ruby/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/ruby/RubyLanguageFrontend.kt b/cpg-language-ruby/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/ruby/RubyLanguageFrontend.kt index 42d1ccb6c3..129d495d46 100644 --- a/cpg-language-ruby/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/ruby/RubyLanguageFrontend.kt +++ b/cpg-language-ruby/src/main/kotlin/de/fraunhofer/aisec/cpg/frontends/ruby/RubyLanguageFrontend.kt @@ -27,11 +27,13 @@ package de.fraunhofer.aisec.cpg.frontends.ruby import de.fraunhofer.aisec.cpg.TranslationContext import de.fraunhofer.aisec.cpg.frontends.LanguageFrontend +import de.fraunhofer.aisec.cpg.frontends.SupportsNewParse import de.fraunhofer.aisec.cpg.graph.* import de.fraunhofer.aisec.cpg.graph.declarations.TranslationUnitDeclaration import de.fraunhofer.aisec.cpg.graph.types.Type import de.fraunhofer.aisec.cpg.sarif.PhysicalLocation import java.io.File +import java.nio.file.Path import org.jruby.Ruby import org.jruby.ast.BlockNode import org.jruby.ast.MethodDefNode @@ -40,19 +42,27 @@ import org.jruby.parser.Parser import org.jruby.parser.ParserConfiguration class RubyLanguageFrontend(language: RubyLanguage, ctx: TranslationContext) : - LanguageFrontend(language, ctx) { + LanguageFrontend(language, ctx), SupportsNewParse { val declarationHandler: DeclarationHandler = 
DeclarationHandler(this) val expressionHandler: ExpressionHandler = ExpressionHandler(this) val statementHandler: StatementHandler = StatementHandler(this) override fun parse(file: File): TranslationUnitDeclaration { + return parse(file.readText(Charsets.UTF_8), file.toPath()) + } + + override fun parse(content: String, path: Path?): TranslationUnitDeclaration { val ruby = Ruby.getGlobalRuntime() val parser = Parser(ruby) val node = parser.parse( - file.path, - file.inputStream(), + if (path != null) { + path.toString() + } else { + "unknown" + }, + content.byteInputStream(), null, ParserConfiguration(ruby, 0, false, true, false) ) as RootNode