From 572f9923b012f81534c2791eff1b7534152b6a2e Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Wed, 2 Oct 2024 22:52:07 +0200 Subject: [PATCH 1/3] Rename `findSymbols` into `lookupSymbolByName` This PR renames `findSymbols` into `lookupSymbolByName` as a more appropriate name, because it lookups a symbol by its name. Fixes #1767 --- .../de/fraunhofer/aisec/cpg/ScopeManager.kt | 25 +++++++++++-------- .../aisec/cpg/passes/ImportResolver.kt | 5 ++-- .../aisec/cpg/passes/SymbolResolver.kt | 6 ++--- .../aisec/cpg/passes/TypeResolver.kt | 2 +- .../cpg/passes/scopes/ScopeManagerTest.kt | 2 +- .../aisec/cpg/passes/GoExtraPass.kt | 2 +- 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt index 72d09e4d6a..6ac7477421 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt @@ -899,7 +899,7 @@ class ScopeManager : ScopeProvider { * @return the declaration, or null if it does not exist */ fun getRecordForName(name: Name): RecordDeclaration? { - return findSymbols(name).filterIsInstance().singleOrNull() + return lookupSymbolByName(name).filterIsInstance().singleOrNull() } fun typedefFor(alias: Name, scope: Scope? = currentScope): Type? { @@ -960,16 +960,21 @@ class ScopeManager : ScopeProvider { get() = currentScope /** - * This function tries to resolve a [Node.name] to a list of symbols (a symbol represented by a - * [Declaration]) starting with [startScope]. This function can return a list of multiple - * symbols in order to check for things like function overloading. but it will only return list - * of symbols within the same scope; the list cannot be spread across different scopes. + * This function tries to convert a [Node.name] into a [Symbol] and then performs a lookup of + * this symbol. 
This can either be an "unqualified lookup" if [name] is not qualified or a + * "qualified lookup" if [Name.isQualified] is true. In the unqualified case the lookup starts + * in [startScope], in the qualified case we use [extractScope] to find the appropriate scope + * and need to restrict our search to this particular scope. * - * This means that as soon one or more symbols are found in a "local" scope, these shadow all - * other occurrences of the same / symbol in a "higher" scope and only the ones from the lower - * ones will be returned. + * This function can return a list of multiple declarations in order to check for things like + * function overloading. but it will only return list of declarations within the same scope; the + * list cannot be spread across different scopes. + * + * This means that as soon one or more declarations for the symbol are found in a "local" scope, + * these shadow all other occurrences of the same / symbol in a "higher" scope and only the ones + * from the lower ones will be returned. */ - fun findSymbols( + fun lookupSymbolByName( name: Name, location: PhysicalLocation? = null, startScope: Scope? = currentScope, @@ -1112,7 +1117,7 @@ data class CallResolutionResult( /** * A set of candidate symbols we discovered based on the [CallExpression.callee] (using - * [ScopeManager.findSymbols]), more specifically a list of [FunctionDeclaration] nodes. + * [ScopeManager.lookupSymbolByName]), more specifically a list of [FunctionDeclaration] nodes. 
*/ var candidateFunctions: Set, diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/ImportResolver.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/ImportResolver.kt index eaa0b92981..5227afa584 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/ImportResolver.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/ImportResolver.kt @@ -58,7 +58,7 @@ class ImportResolver(ctx: TranslationContext) : ComponentPass(ctx) { // Let's do some importing. We need to import either a wildcard if (node.wildcardImport) { - val list = scopeManager.findSymbols(node.import, node.location, scope) + val list = scopeManager.lookupSymbolByName(node.import, node.location, scope) val symbol = list.singleOrNull() if (symbol != null) { // In this case, the symbol must point to a name scope @@ -69,7 +69,8 @@ class ImportResolver(ctx: TranslationContext) : ComponentPass(ctx) { } } else { // or a symbol directly - val list = scopeManager.findSymbols(node.import, node.location, scope).toMutableList() + val list = + scopeManager.lookupSymbolByName(node.import, node.location, scope).toMutableList() node.importedSymbols = mutableMapOf(node.symbol to list) } } diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt index ef5f1cb8e0..c1572a84dd 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt @@ -178,7 +178,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { // Find a list of candidate symbols. Currently, this is only used the in the "next-gen" call // resolution, but in future this will also be used in resolving regular references. 
- current.candidates = scopeManager.findSymbols(current.name, current.location).toSet() + current.candidates = scopeManager.lookupSymbolByName(current.name, current.location).toSet() // Preparation for a future without legacy call resolving. Taking the first candidate is not // ideal since we are running into an issue with function pointers here (see workaround @@ -679,7 +679,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { var candidates = mutableSetOf() val records = possibleContainingTypes.mapNotNull { it.root.recordDeclaration }.toSet() for (record in records) { - candidates.addAll(ctx.scopeManager.findSymbols(record.name.fqn(symbol))) + candidates.addAll(ctx.scopeManager.lookupSymbolByName(record.name.fqn(symbol))) } // Find invokes by supertypes @@ -845,7 +845,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { listOf() } else { val firstLevelCandidates = - possibleTypes.map { scopeManager.findSymbols(it.name.fqn(name)) }.flatten() + possibleTypes.map { scopeManager.lookupSymbolByName(it.name.fqn(name)) }.flatten() // C++ does not allow overloading at different hierarchy levels. If we find a // FunctionDeclaration with the same name as the function in the CallExpression we have diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/TypeResolver.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/TypeResolver.kt index df57786fd1..b27c5dd7d9 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/TypeResolver.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/TypeResolver.kt @@ -67,7 +67,7 @@ open class TypeResolver(ctx: TranslationContext) : ComponentPass(ctx) { // constructor declarations and such with the same name. It seems this is ok since most // languages will prefer structs/classes over functions when resolving types. 
var symbols = - ctx?.scopeManager?.findSymbols(type.name, startScope = type.scope) { + ctx?.scopeManager?.lookupSymbolByName(type.name, startScope = type.scope) { it is DeclaresType } ?: listOf() diff --git a/cpg-core/src/test/kotlin/de/fraunhofer/aisec/cpg/passes/scopes/ScopeManagerTest.kt b/cpg-core/src/test/kotlin/de/fraunhofer/aisec/cpg/passes/scopes/ScopeManagerTest.kt index a73658f421..3fb62a3bfe 100644 --- a/cpg-core/src/test/kotlin/de/fraunhofer/aisec/cpg/passes/scopes/ScopeManagerTest.kt +++ b/cpg-core/src/test/kotlin/de/fraunhofer/aisec/cpg/passes/scopes/ScopeManagerTest.kt @@ -100,7 +100,7 @@ internal class ScopeManagerTest : BaseTest() { // resolve symbol val call = frontend.newCallExpression(frontend.newReference("A::func1"), "A::func1", false) - val func = final.findSymbols(call.callee!!.name).firstOrNull() + val func = final.lookupSymbolByName(call.callee!!.name).firstOrNull() assertEquals(func1, func) } diff --git a/cpg-language-go/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/GoExtraPass.kt b/cpg-language-go/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/GoExtraPass.kt index 2df5eeb357..5297e13826 100644 --- a/cpg-language-go/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/GoExtraPass.kt +++ b/cpg-language-go/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/GoExtraPass.kt @@ -403,7 +403,7 @@ class GoExtraPass(ctx: TranslationContext) : ComponentPass(ctx) { // Try to see if we already know about this namespace somehow val namespace = - scopeManager.findSymbols(import.name, null).filter { + scopeManager.lookupSymbolByName(import.name, null).filter { it is NamespaceDeclaration && it.path == import.importURL } From 3e766f5457af810e68fc890984e4277005a5195d Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Thu, 3 Oct 2024 09:36:43 +0200 Subject: [PATCH 2/3] Update cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt Co-authored-by: KuechA <31155350+KuechA@users.noreply.github.com> --- 
.../src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt index 6ac7477421..f45809b3a5 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/ScopeManager.kt @@ -967,7 +967,7 @@ class ScopeManager : ScopeProvider { * and need to restrict our search to this particular scope. * * This function can return a list of multiple declarations in order to check for things like - * function overloading. but it will only return list of declarations within the same scope; the + * function overloading. But it will only return list of declarations within the same scope; the * list cannot be spread across different scopes. * * This means that as soon one or more declarations for the symbol are found in a "local" scope, From a84f369f03461a06526f8576715aac720cb49775 Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Thu, 3 Oct 2024 09:39:39 +0200 Subject: [PATCH 3/3] Added documentation --- docs/docs/CPG/impl/index.md | 1 + docs/docs/CPG/impl/scopes.md | 97 ++++++++++++++++++++++++++- docs/docs/CPG/impl/symbol-resolver.md | 38 +++++++++++ docs/mkdocs.yaml | 3 +- 4 files changed, 135 insertions(+), 4 deletions(-) create mode 100644 docs/docs/CPG/impl/symbol-resolver.md diff --git a/docs/docs/CPG/impl/index.md b/docs/docs/CPG/impl/index.md index 9359f148d5..dc0b49f7db 100755 --- a/docs/docs/CPG/impl/index.md +++ b/docs/docs/CPG/impl/index.md @@ -24,3 +24,4 @@ the graph. These two stages are strictly separated one from each other. 
* [Languages and Language Frontends](./language) * [Scopes](./scopes) * [Passes](./passes) +* [Symbol Resolution](./symbol-resolver.md) diff --git a/docs/docs/CPG/impl/scopes.md b/docs/docs/CPG/impl/scopes.md index 9fafc1ea83..c5ab985104 100755 --- a/docs/docs/CPG/impl/scopes.md +++ b/docs/docs/CPG/impl/scopes.md @@ -1,6 +1,6 @@ --- -title: "Implementation and Concepts - Scopes" -linkTitle: "Implementation and Concepts - Scopes" +title: "Implementation and Concepts - Scopes and Symbols" +linkTitle: "Implementation and Concepts - Scopes and Symbols" weight: 20 no_list: false menu: @@ -11,5 +11,96 @@ description: > --- -# Implementation and Concepts: Scopes and Scope Manger +# Scopes and Symbols +The concepts of scopes and symbols are at the heart of every programming language and thus are also the core of static analysis. Both concepts exist in the CPG library through the types `Scope` and `Symbol` respectively. + +A "symbol" can be seen as an identifier in most programming languages, referring to variables or functions. Symbols are often grouped in scopes, which define the visibility of a symbol, e.g. a slice of a program that can "see" the symbol. Often this is also synonymous with the life-time of a variable, e.g., that its memory will be freed (or collected by a garbage collector) once it goes "out of scope". + +```c +// This defines a symbol "a" in the global/file scope. +// Its visibility is global within the file. +int a = 1; + +int main() { + // this defines another symbol "a" in a function/block scope. + // Its visibility is limited to the block it is defined in. + int a = 1; +} +``` + +Usually symbols declared in a local scope override the declaration of a symbol in a higher (e.g., global) scope, which is also referred to as "shadowing". This needs to be taken into account when resolving symbols to their declarations. + +The `Scope` class holds all its symbols in the `Scope::symbols` property. 
More specifically, this property is a `SymbolMap`, which is a type alias to a map, whose key type is a `Symbol` and whose value type is a list of `Declaration` nodes. This is basically a symbol lookup table for all symbols in its scope. It is a map of a list because some programming languages have concepts like function overloading, which leads to the declaration of multiple `FunctionDeclaration` nodes under the same symbol in one scope. In the current implementation, a `Symbol` is just a typealias for a string, and it is always "local" to the scope, meaning that it MUST NOT contain any qualifier. If you want to refer to a fully qualified identifier, a `Name` must be used. In the future, we might consider merging the concepts of `Symbol` and `Name`. + +For a frontend or pass developer, the main interaction point with scopes and symbols is through the `ScopeManager`. The scope manager is available to all nodes via the `TranslationContext` and also injected in frontend, handlers and passes. + +## Hierarchy of Scopes + +Each scope (except the `GlobalScope`) can have a parent and possible child scopes. This can be used to model a hierarchy of scopes within a program. For example using the snippet above, the following scopes are defined in the CPG: + +* A `GlobalScope` that comprises the whole file +* A `FunctionScope` that comprises the function `main` +* A `BlockScope` that comprises the function body + +Note, that each programming language is different when it comes to scoping and this needs to be thought of by a frontend developer. For example in C/C++ each block introduced by `{}` introduces a new scope and variables can be declared only for such a block, meaning that each `for`, `if` and other statements also introduce a new scope. In contrast, Python only differentiates between a global scope, function and class scope. 
+ +## Defining Scopes and Declaring Symbols + +In order to define new scopes, the `ScopeManager` offers two main APIs: + +* `enterScope(node)`, which specifies that `node` will declare a new scope and that an appropriate `Scope` (or derived type) will be created +* `leaveScope(node)`, which closes the scope again + +It is important that every opened scope must also be closed again. When scopes are nested, they also need to be closed in reverse order. + +```Kotlin +// We are inside the global scope here and want to create a new function +var func = newFunctionDeclaration("main") + +// Create a function scope +scopeManager.enterScope(func) + +// Create a block scope for the body because our language works this way +var body = newBlock() +func.body = body +scopeManager.enterScope(body) + +// Add statements here +body.statements += /* ... */ + +// Leave block scope +scopeManager.leaveScope(body) + +// Back to global scope, add the function to global scope +scopeManager.leaveScope(func) +scopeManager.addDeclaration(func) +``` + +Inside the scope, declarations can be added with `ScopeManager::addDeclaration`. This takes care of adding the declaration to an appropriate place in the AST (which is beyond the scope of this document) and also adds the `Declaration` to the `Scope` under the appropriate `Symbol`. + + +## Looking up Symbols + +During different analysis steps, e.g., in different passes, we want to find certain symbols or look up the declaration(s) belonging to a particular symbol. There are two functions in order to do so - a "higher" level concept in the `ScopeManager` and a "lower" level function on the `Scope` itself. + +The lower level one is called `Scope::lookupSymbol` and can be used to retrieve a list of `Declaration` nodes that belong to a particular `Symbol` that is "visible" in the scope. It does so by first looking through its own `Scope::symbols`. If no match was found, the scope is traversed upwards to its `Scope::parent`, until a match is found. 
Furthermore, additional logic is needed to resolve symbols that are pointing to another scope, e.g., because they represent an `ImportDeclaration`. + +```Kotlin +var scope = /* ... */ +var declarations = scope.lookupSymbol("a") { + // Some additional predicate if we want +} +``` + +Additionally, the lookup can be fine-tuned by an additional predicate. However, this should be used carefully as it restricts the possible list of symbols very early. In most cases the list of symbols should be quite exhaustive at first to find all possible candidates, and the best candidate should then be selected in a second step (e.g., based on argument types for a function call). + +While the aforementioned API works great if we already have a specific start scope and local `Symbol`, we often start our resolution process with a `Name` -- which could potentially be qualified, such as `std::string`. Therefore, the "higher level" function `ScopeManager::lookupSymbolByName` can be used to retrieve a list of candidate declarations by a given `Name`. In a first step, the name is checked for a potential scope qualifier (`std` in this example). If present, it is extracted and the search scope is set to it. This is what is usually referred to as a "qualified lookup". Otherwise, the local part of the name is used to start the lookup, in what is called an "unqualified lookup". In both cases, the actual lookup is delegated to `Scope::lookupSymbol`, but with different parameters. + +```Kotlin +var name = parseName("std::string") +// This will return all the 'string' symbols within the 'std' name scope +var stringSymbols = scopeManager.lookupSymbolByName(name) +``` + +Developers should avoid symbol lookup during frontend parsing, since often during parsing, only a limited view of all symbols is available. Instead, a dedicated pass that is run on the complete translation result is the preferred option. Apart from that, the main usage of this API is in the [SymbolResolver](symbol-resolver.md). 
\ No newline at end of file diff --git a/docs/docs/CPG/impl/symbol-resolver.md b/docs/docs/CPG/impl/symbol-resolver.md new file mode 100644 index 0000000000..c052e6b69f --- /dev/null +++ b/docs/docs/CPG/impl/symbol-resolver.md @@ -0,0 +1,38 @@ +--- +title: "Implementation and Concepts - Symbol Resolution" +linkTitle: "Implementation and Concepts - Symbol Resolution" +weight: 20 +no_list: false +menu: + main: + weight: 20 +description: > + The CPG library is a language-agnostic graph representation of source code. +--- + + +# Symbol Resolution + +This page describes the main functionality behind symbol resolution in the CPG library. This is mostly done by the `SymbolResolver` pass, in combination with the symbol lookup API (see [Scopes and Symbols](scopes.md#looking-up-symbols)). In addition to the *lookup* of a symbol, the *resolution* takes the input of the lookup and provides a "definite" decision which symbol is used. This mostly refers to symbols / names used in a `Reference` or a `CallExpression` (which also has a reference as its `CallExpression::callee`). + +## The `SymbolResolver` Pass + +The `SymbolResolver` pass takes care of the heavy lifting of symbol (or rather reference) resolving: + +* It sets the `Reference::refersTo` property, +* and sets the `CallExpression::invokes` property, +* and finally takes care of operator overloading (if the language supports it). + +In a way, it can be compared to a linker step in a compiler. The pass operates on a single `Component` and starts by identifying EOG starter nodes within the component. These nodes "start" an EOG sub-graph, i.e., they do not have any previous EOG edges. The symbol resolver uses the `ScopedWalker` with a special set-up that traverses the EOG starting with each EOG starter node until it reaches the end. This ensures that symbols are resolved in the correct order of "evaluation", e.g., that a base of a member expression is resolved before the expression itself. 
This ensures that necessary type information on the base is available in order to resolve appropriate fields of the member expression. + +The symbol resolver itself has gone through many re-writes over the years and there is still some code left that we consider *legacy*. These functions are marked as such, and we aim to remove them slowly. + +## Resolving References + +The main functionality lies in `SymbolResolver::handleReference`. For all `Reference` nodes (that are not `MemberExpression` nodes) we use the symbol lookup API to find declaration candidates for the name the reference is referring to. This candidate list is then stored in `Reference::candidates`. If the reference is the `CallExpression::callee` property of a call, we abort here and jump to [Resolve Calls](#resolve-calls). + +Otherwise, we currently take the first entry of the candidate list and set the `Reference::refersTo` property to it. + +## Resolve Calls + +Prerequisite: The `CallExpression::callee` reference must have been resolved (see [Resolving References](#resolving-references)). \ No newline at end of file diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index b6a535732d..e73c12e752 100755 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -167,8 +167,9 @@ nav: - "Implementation": - CPG/impl/index.md - "Language Frontends": CPG/impl/language.md - - "Scopes": CPG/impl/scopes.md + - "Scopes and Symbols": CPG/impl/scopes.md - "Passes": CPG/impl/passes.md + - "Symbol Resolution": CPG/impl/symbol-resolver.md - "Contributing": - "Contributing to the CPG library": Contributing/index.md # This assumes that the most recent dokka build was generated with the "main" tag!