From 34fd4b851f6397836358eae729beffdd7bda0ed1 Mon Sep 17 00:00:00 2001 From: Christian Banse Date: Tue, 1 Oct 2024 00:29:59 +0200 Subject: [PATCH] Added docs for symbol resolver --- .../aisec/cpg/passes/SymbolResolver.kt | 36 ++++++++++++++---- docs/docs/CPG/impl/index.md | 1 + docs/docs/CPG/impl/scopes.md | 6 ++- docs/docs/CPG/impl/symbol-resolver.md | 38 +++++++++++++++++++ docs/mkdocs.yaml | 3 +- 5 files changed, 74 insertions(+), 10 deletions(-) create mode 100644 docs/docs/CPG/impl/symbol-resolver.md diff --git a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt index 0032c9eabdb..3a4edeea87f 100644 --- a/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt +++ b/cpg-core/src/main/kotlin/de/fraunhofer/aisec/cpg/passes/SymbolResolver.kt @@ -171,7 +171,7 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { // identifier in Go) if ( language is HasAnonymousIdentifier && - current.name.localName == language.anonymousIdentifier + current.name.localName == language.anonymousIdentifier ) { return } @@ -180,6 +180,11 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { // resolution, but in future this will also be used in resolving regular references. current.candidates = scopeManager.findSymbols(current.name, current.location).toSet() + // Preparation for a future without legacy call resolving. Taking the first candidate is not + // ideal since we are running into an issue with function pointers here (see workaround + // below). + var wouldResolveTo = current.candidates.singleOrNull() + // For now, we need to ignore reference expressions that are directly embedded into call // expressions, because they are the "callee" property. In the future, we will use this // property to actually resolve the function call. 
However, there is a special case that @@ -189,21 +194,38 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { // of this call expression back to its original variable declaration. In the future, we want // to extend this particular code to resolve all callee references to their declarations, // i.e., their function definitions and get rid of the separate CallResolver. - var wouldResolveTo: Declaration? = null if (current.resolutionHelper is CallExpression) { // Peek into the declaration, and if it is only one declaration and a variable, we can // proceed normally, as we are running into the special case explained above. Otherwise, // we abort here (for now). - wouldResolveTo = current.candidates.singleOrNull() if (wouldResolveTo !is VariableDeclaration && wouldResolveTo !is ParameterDeclaration) { return } } + // Some stupid C++ workaround to use the legacy call resolver when we try to resolve targets + // for function pointers. At least we are only invoking the legacy resolver for a very small + // percentage of references now. + if (wouldResolveTo is FunctionDeclaration) { + // We need to invoke the legacy resolver, just to be sure + var legacy = scopeManager.resolveReference(current) + + // This is just for us to catch these differences in symbol resolving in the future. The + // difference is pretty much only that the legacy system takes parameters of the + // function-pointer-type into account and the new system does not (yet), because it just + // takes the first match. This will be needed to solve in the future. + if (legacy != wouldResolveTo) { + log.warn( + "The legacy symbol resolution and the new system produced different results here. This needs to be investigated in the future. For now, we take the legacy result." + ) + wouldResolveTo = legacy + } + } + // Only consider resolving, if the language frontend did not specify a resolution. 
If we // already have populated the wouldResolveTo variable, we can re-use this instead of // resolving again - var refersTo = current.refersTo ?: wouldResolveTo ?: scopeManager.resolveReference(current) + var refersTo = current.refersTo ?: wouldResolveTo var recordDeclType: Type? = null if (currentClass != null) { @@ -218,9 +240,9 @@ open class SymbolResolver(ctx: TranslationContext) : ComponentPass(ctx) { // only add new nodes for non-static unknown if ( refersTo == null && - !current.isStaticAccess && - recordDeclType != null && - recordDeclType.recordDeclaration != null + !current.isStaticAccess && + recordDeclType != null && + recordDeclType.recordDeclaration != null ) { // Maybe we are referring to a field instead of a local var val field = resolveMember(recordDeclType, current) diff --git a/docs/docs/CPG/impl/index.md b/docs/docs/CPG/impl/index.md index 9359f148d5d..dc0b49f7db1 100755 --- a/docs/docs/CPG/impl/index.md +++ b/docs/docs/CPG/impl/index.md @@ -24,3 +24,4 @@ the graph. These two stages are strictly separated one from each other. * [Languages and Language Frontends](./language) * [Scopes](./scopes) * [Passes](./passes) +* [Symbol Resolution](./symbol-resolver.md) diff --git a/docs/docs/CPG/impl/scopes.md b/docs/docs/CPG/impl/scopes.md index 7d0f838a1d3..83fa49f1534 100755 --- a/docs/docs/CPG/impl/scopes.md +++ b/docs/docs/CPG/impl/scopes.md @@ -1,6 +1,6 @@ --- -title: "Implementation and Concepts - Scopes" -linkTitle: "Implementation and Concepts - Scopes" +title: "Implementation and Concepts - Scopes and Symbols" +linkTitle: "Implementation and Concepts - Scopes and Symbols" weight: 20 no_list: false menu: @@ -102,3 +102,5 @@ var name = parseName("std::string") // This will return all the 'string' symbols within the 'std' name scope var stringSymbols = scopeManager.findSymbols(name) ``` + +Developers should avoid symbol lookup during frontend parsing, since often during parsing, only a limited view of all symbols is available. 
Instead, a dedicated pass that is run on the complete translation result is the preferred option. Apart from that, the main usage of this API is in the [SymbolResolver](symbol-resolver.md). \ No newline at end of file diff --git a/docs/docs/CPG/impl/symbol-resolver.md b/docs/docs/CPG/impl/symbol-resolver.md new file mode 100644 index 00000000000..c052e6b69fc --- /dev/null +++ b/docs/docs/CPG/impl/symbol-resolver.md @@ -0,0 +1,38 @@ +--- +title: "Implementation and Concepts - Symbol Resolution" +linkTitle: "Implementation and Concepts - Symbol Resolution" +weight: 20 +no_list: false +menu: + main: + weight: 20 +description: > + The CPG library is a language-agnostic graph representation of source code. +--- + + +# Symbol Resolution + +This page describes the main functionality behind symbol resolution in the CPG library. This is mostly done by the `SymbolResolver` pass, in combination with the symbol lookup API (see [Scopes and Symbols](scopes.md#looking-up-symbols)). In addition to the *lookup* of a symbol, the *resolution* takes the input of the lookup and provides a "definite" decision on which symbol is used. This mostly refers to symbols / names used in a `Reference` or a `CallExpression` (which also has a reference as its `CallExpression::callee`). + +## The `SymbolResolver` Pass + +The `SymbolResolver` pass takes care of the heavy lifting of symbol (or rather reference) resolving: + +* It sets the `Reference::refersTo` property, +* and sets the `CallExpression::invokes` property, +* and finally takes care of operator overloading (if the language supports it). + +In a way, it can be compared to a linker step in a compiler. The pass operates on a single `Component` and starts by identifying EOG starter nodes within the component. These nodes "start" an EOG sub-graph, i.e., they do not have any previous EOG edges. The symbol resolver uses the `ScopedWalker` with a special set-up that traverses the EOG starting with each EOG starter node until it reaches the end. 
This ensures that symbols are resolved in the correct order of "evaluation", e.g., that a base of a member expression is resolved before the expression itself. This ensures that necessary type information on the base is available in order to resolve appropriate fields of the member expression. + +The symbol resolver itself has gone through many re-writes over the years and there is still some code left that we consider *legacy*. These functions are marked as such, and we aim to remove them slowly. + +## Resolving References + +The main functionality lies in `SymbolResolver::handleReference`. For all `Reference` nodes (that are not `MemberExpression` nodes) we use the symbol lookup API to find declaration candidates for the name the reference is referring to. This candidate list is then stored in `Reference::candidates`. If the reference is the `CallExpression::callee` property of a call, we abort here and jump to [Resolve Calls](#resolve-calls). + +Otherwise, we currently take the first entry of the candidate list and set the `Reference::refersTo` property to it. + +## Resolve Calls + +Prerequisite: The `CallExpression::callee` reference must have been resolved (see [Resolving References](#resolving-references)). \ No newline at end of file diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index b6a535732db..e73c12e7522 100755 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -167,8 +167,9 @@ nav: - "Implementation": - CPG/impl/index.md - "Language Frontends": CPG/impl/language.md - - "Scopes": CPG/impl/scopes.md + - "Scopes and Symbols": CPG/impl/scopes.md - "Passes": CPG/impl/passes.md + - "Symbol Resolution": CPG/impl/symbol-resolver.md - "Contributing": - "Contributing to the CPG library": Contributing/index.md # This assumes that the most recent dokka build was generated with the "main" tag!