From dc37699b254da9db5bff641c6ecda985a8d5f778 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 29 Jul 2024 17:34:39 +0100 Subject: [PATCH] [SYCLLowerIR] Remove !amdgcn.annotations metadata (#14713) The `!amdgcn.annotations` metadata was a SYCL-specific addition. The concept of annotations for AMDGPU makes it appear as if it's a mirror of NVVM annotations, when in fact it's just a kernel tagging mechanism. It is not a feature supported by AMD's drivers. We don't need to rely on this, as the functions' calling conventions identify kernels. We also rely on the "sycl-device" module flag to restrict the passes to SYCL code. This patch re-uses the existing `TargetHelpers` namespace to hide the target-specific logic behind a new class: the `KernelCache`. This provides a way of maintaining a cache of kernels, with optional annotation metadata (it could be expanded in the future with more types of payload). It also provides abstracted ways of handling certain RAUW operations on kernels, though currently only a minimum required to support the two existing patterns. The aim of this is to hide all concept of "annotations" from the passes, and make it an implementation detail of the `KernelCache`. During this work, it was noticed that our handling of annotations was incomplete. NVVM annotations are not required to only have 3 operands, as the official documentation shows. It's actually a list of pairs, any one of which may declare the function a kernel. Thus we may have missed valid kernels. Tests have been added to check for this. The `GlobalOffset` pass was also treating "unsupported" architectures as AMDGPU architectures, so that has been tightened up and the tests have been updated to ensure they actually register as AMD modules. LIT tests have been cleaned up somewhat, to remove unnecessary features like comments and function linkage types. 
Several LIT tests have been converted to use the `update_test_checks.py` or `update_llc_test_checks.py` scripts, where appropriate. These tools cannot currently emit checks for named metadata nor certain assembly features, so some tests must remain as they are. --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 39 ---- clang/test/CodeGenSYCL/kernel-annotation.cpp | 5 +- llvm/docs/AMDGPUUsage.rst | 27 --- llvm/include/llvm/SYCLLowerIR/GlobalOffset.h | 25 +-- .../SYCLLowerIR/LocalAccessorToSharedMemory.h | 10 - llvm/include/llvm/SYCLLowerIR/TargetHelpers.h | 55 ++++-- llvm/lib/SYCLLowerIR/GlobalOffset.cpp | 105 +++++----- .../LocalAccessorToSharedMemory.cpp | 43 ++--- llvm/lib/SYCLLowerIR/TargetHelpers.cpp | 182 ++++++++++-------- llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll | 78 +++++--- .../global-offset-intrinsic-function-mix.ll | 103 +++++----- .../AMDGPU/global-offset-invalid-triple.ll | 20 +- ...offset-multiple-calls-from-one-function.ll | 88 ++++----- .../global-offset-multiple-entry-points.ll | 175 ++++++++++------- .../CodeGen/AMDGPU/global-offset-removal.ll | 23 ++- .../CodeGen/AMDGPU/global-offset-simple.ll | 71 ++++--- ...r-to-shared-memory-basic-transformation.ll | 35 ++-- ...sor-to-shared-memory-multiple-functions.ll | 44 +++-- ...cessor-to-shared-memory-no-entry-points.ll | 24 +-- ...cessor-to-shared-memory-preserves-types.ll | 45 +++-- .../local-accessor-to-shared-memory-triple.ll | 11 +- ...-accessor-to-shared-memory-valid-triple.ll | 37 +--- .../NVPTX/global-offset-annotations.ll | 39 ++++ llvm/test/CodeGen/NVPTX/global-offset-dbg.ll | 75 ++++++-- .../global-offset-intrinsic-function-mix.ll | 27 +-- .../NVPTX/global-offset-invalid-triple.ll | 7 +- ...offset-multiple-calls-from-one-function.ll | 19 +- .../global-offset-multiple-entry-points.ll | 48 ++--- .../CodeGen/NVPTX/global-offset-removal.ll | 5 +- .../CodeGen/NVPTX/global-offset-simple.ll | 22 ++- .../NVPTX/global-offset-valid-triple.ll | 8 +- ...l-accessor-to-shared-memory-annotations.ll | 45 +++++ 
...r-to-shared-memory-basic-transformation.ll | 40 ++-- ...ccessor-to-shared-memory-invalid-triple.ll | 9 +- ...r-to-shared-memory-multiple-annotations.ll | 34 ++-- ...sor-to-shared-memory-multiple-functions.ll | 49 +++-- ...cessor-to-shared-memory-no-entry-points.ll | 29 ++- ...cessor-to-shared-memory-preserves-types.ll | 53 +++-- .../local-accessor-to-shared-memory-triple.ll | 7 +- ...-accessor-to-shared-memory-valid-triple.ll | 8 +- .../passes/target/TargetFusionInfo.cpp | 5 - .../promote-private-non-unit-hip.ll | 3 - .../kernel-fusion/check-remapping-amdgpu.ll | 6 +- .../check-remapping-interproc-amdgpu.ll | 6 +- 44 files changed, 1000 insertions(+), 789 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/global-offset-annotations.ll create mode 100644 llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-annotations.ll diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index f05b95e04c4d8..4d3275e17c386 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -317,12 +317,6 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { bool shouldEmitStaticExternCAliases() const override; bool shouldEmitDWARFBitFieldSeparators() const override; void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; - -private: - // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the - // resulting MDNode to the amdgcn.annotations MDNode. - static void addAMDGCNMetadata(llvm::GlobalValue *GV, StringRef Name, - int Operand); }; } @@ -404,33 +398,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( } } -/// Helper function for AMDGCN and NVVM targets, adds a NamedMDNode with GV, -/// Name, and Operand as operands, and adds the resulting MDNode to the -/// AnnotationName MDNode. 
-static void addAMDGCOrNVVMMetadata(const char *AnnotationName, - llvm::GlobalValue *GV, StringRef Name, - int Operand) { - llvm::Module *M = GV->getParent(); - llvm::LLVMContext &Ctx = M->getContext(); - - // Get annotations metadata node. - llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(AnnotationName); - - llvm::Metadata *MDVals[] = { - llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name), - llvm::ConstantAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))}; - // Append metadata to annotations node. - MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); -} - - -void AMDGPUTargetCodeGenInfo::addAMDGCNMetadata(llvm::GlobalValue *GV, - StringRef Name, int Operand) { - addAMDGCOrNVVMMetadata("amdgcn.annotations", GV, Name, Operand); -} - - /// Emits control constants used to change per-architecture behaviour in the /// AMDGPU ROCm device libraries. void AMDGPUTargetCodeGenInfo::emitTargetGlobals( @@ -483,12 +450,6 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( if (FD) setFunctionDeclAttributes(FD, F, M); - // Create !{, metadata !"kernel", i32 1} node for SYCL kernels. 
- const bool IsSYCLKernel = - FD && M.getLangOpts().SYCLIsDevice && FD->hasAttr(); - if (IsSYCLKernel) - addAMDGCNMetadata(F, "kernel", 1); - if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); diff --git a/clang/test/CodeGenSYCL/kernel-annotation.cpp b/clang/test/CodeGenSYCL/kernel-annotation.cpp index e2640708969cd..378888da50d09 100644 --- a/clang/test/CodeGenSYCL/kernel-annotation.cpp +++ b/clang/test/CodeGenSYCL/kernel-annotation.cpp @@ -23,15 +23,12 @@ class Functor { }; // CHECK-SPIR-NOT: annotations = +// CHECK-AMDGCN-NOT: annotations = // CHECK-NVPTX: nvvm.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]} // CHECK-NVPTX: [[FIRST]] = !{ptr @_ZTS7Functor, !"kernel", i32 1} // CHECK-NVPTX: [[SECOND]] = !{ptr @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E5foo_2, !"kernel", i32 1} -// CHECK-AMDGCN: amdgcn.annotations = !{[[FIRST:![0-9]]], [[SECOND:![0-9]]]} -// CHECK-AMDGCN: [[FIRST]] = !{ptr @_ZTS7Functor, !"kernel", i32 1} -// CHECK-AMDGCN: [[SECOND]] = !{ptr @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E5foo_2, !"kernel", i32 1} - int main() { sycl::queue q; q.submit([&](sycl::handler &cgh) { diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index e4cccfa4ffc36..192df32229787 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -15847,33 +15847,6 @@ track the usage for each kernel. However, in some cases careful organization of the kernels and functions in the source file means there is minimal additional effort required to accurately calculate GPR usage. -SYCL Kernel Metadata -==================== - -This section describes the additional metadata that is inserted for SYCL -kernels. As SYCL is a single source programming model functions can either -execute on a host or a device (i.e. GPU). Device kernels are akin to kernel -entry-points in GPU program. To mark an LLVM IR function as a device kernel -function, we make use of special LLVM metadata. 
The AMDGCN back-end will look -for a named metadata node called ``amdgcn.annotations``. This named metadata -must contain a list of metadata that describe the kernel IR. For our purposes, -we need to declare a metadata node that assigns the `"kernel"` attribute to the -LLVM IR function that should be emitted as a SYCL kernel function. These -metadata nodes take the form: - -.. code-block:: text - - !{, metadata !"kernel", i32 1} - -Consider the metadata generated by global-offset pass, showing a void kernel -function `example_kernel_with_offset` taking one argument, a pointer to 3 i32 -integers: - -.. code-block:: llvm - - !amdgcn.annotations = !{!0} - !0 = !{void ([3 x i32]*)* @_ZTS14example_kernel_with_offset, !"kernel", i32 1} - Additional Documentation ======================== diff --git a/llvm/include/llvm/SYCLLowerIR/GlobalOffset.h b/llvm/include/llvm/SYCLLowerIR/GlobalOffset.h index 195b1471ad9a0..48a274316a8ba 100644 --- a/llvm/include/llvm/SYCLLowerIR/GlobalOffset.h +++ b/llvm/include/llvm/SYCLLowerIR/GlobalOffset.h @@ -24,10 +24,6 @@ class PassRegistry; /// with an offset parameter which will be threaded through from the kernel /// entry point. class GlobalOffsetPass : public PassInfoMixin { -private: - using KernelPayload = TargetHelpers::KernelPayload; - using ArchType = TargetHelpers::ArchType; - public: explicit GlobalOffsetPass() {} @@ -41,7 +37,8 @@ class GlobalOffsetPass : public PassInfoMixin { /// appended to the name. /// /// \param Func Kernel to be processed. - void processKernelEntryPoint(Function *Func); + void processKernelEntryPoint(Function *Func, + TargetHelpers::KernelCache &KCache); /// For a function containing a call instruction to the implicit offset /// intrinsic, or another function which eventually calls the intrinsic, @@ -65,7 +62,8 @@ class GlobalOffsetPass : public PassInfoMixin { /// to have the implicit parameter added to it or be replaced with the /// implicit parameter. 
void addImplicitParameterToCallers(Module &M, Value *Callee, - Function *CalleeWithImplicitParam); + Function *CalleeWithImplicitParam, + TargetHelpers::KernelCache &KCache); /// For a given function `Func` create a clone and extend its signature to /// contain an implicit offset argument. @@ -89,18 +87,6 @@ class GlobalOffsetPass : public PassInfoMixin { Type *ImplicitArgumentType = nullptr, bool KeepOriginal = false, bool IsKernel = false); - /// Create a mapping of kernel entry points to their metadata nodes. While - /// iterating over kernels make sure that a given kernel entry point has no - /// llvm uses. - /// - /// \param KernelPayloads A collection of kernel functions present in a - /// module `M`. - /// - /// \returns A map of kernel functions to corresponding metadata nodes. - DenseMap - generateKernelMDNodeMap(Module &M, - SmallVectorImpl &KernelPayloads); - private: /// Keep track of all cloned offset functions to avoid processing them. llvm::SmallPtrSet Clones; @@ -109,14 +95,11 @@ class GlobalOffsetPass : public PassInfoMixin { /// Keep track of which non-offset functions have been processed to avoid /// processing twice. llvm::DenseMap ProcessedFunctions; - /// Keep a map of all entry point functions with metadata. - llvm::DenseMap EntryPointMetadata; /// A type of implicit argument added to the kernel signature. llvm::Type *KernelImplicitArgumentType = nullptr; /// A type used for the alloca holding the values of global offsets. llvm::Type *ImplicitOffsetPtrType = nullptr; - ArchType AT; unsigned TargetAS = 0; }; diff --git a/llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h b/llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h index 9dfd6dc189264..d6d15e8b9b500 100644 --- a/llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h +++ b/llvm/include/llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h @@ -25,10 +25,6 @@ class PassRegistry; /// functions. 
class LocalAccessorToSharedMemoryPass : public PassInfoMixin { -private: - using KernelPayload = TargetHelpers::KernelPayload; - using ArchType = TargetHelpers::ArchType; - public: explicit LocalAccessorToSharedMemoryPass() {} @@ -49,12 +45,6 @@ class LocalAccessorToSharedMemoryPass /// \returns A new function with global symbol accesses. Function *processKernel(Module &M, Function *F); - /// Update kernel metadata to reflect the change in the signature. - /// - /// \param A map of original kernels to the modified ones. - void postProcessKernels( - SmallVectorImpl> &NewToOldKernels); - private: /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen /// to be 3. diff --git a/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h b/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h index fba50396e6be2..e2a0e917b9dbd 100644 --- a/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h +++ b/llvm/include/llvm/SYCLLowerIR/TargetHelpers.h @@ -22,21 +22,54 @@ using namespace llvm; namespace llvm { namespace TargetHelpers { -enum class ArchType { Cuda, AMDHSA, Unsupported }; +struct KernelCache { + void populateKernels(Module &M); -struct KernelPayload { - KernelPayload(Function *Kernel, MDNode *MD = nullptr); - Function *Kernel; - MDNode *MD; - SmallVector DependentMDs; -}; + bool isKernel(Function &F) const; + + /// Updates cached data with a function intended as a replacement of an + /// existing function. + void handleReplacedWith(Function &OldF, Function &NewF); + + /// Updates cached data with a new clone of an existing function. + /// The KernelOnly parameter updates cached data with only the information + /// required to identify the new function as a kernel. + void handleNewCloneOf(Function &OldF, Function &NewF, bool KernelOnly); + +private: + /// Extra data about a kernel function. Only applicable to NVPTX kernels, + /// which have associated annotation metadata. 
+ struct KernelPayload { + explicit KernelPayload() = default; + KernelPayload(NamedMDNode *ModuleAnnotationsMD); + + bool hasAnnotations() const { return ModuleAnnotationsMD != nullptr; } -ArchType getArchType(const Module &M); + /// ModuleAnnotationsMD - metadata containing the unique global list of + /// annotations. + NamedMDNode *ModuleAnnotationsMD = nullptr; + SmallVector DependentMDs; + }; -std::string getAnnotationString(ArchType AT); + /// List of kernels in original Module order + SmallVector Kernels; + /// Map of kernels to extra data. Also serves as a quick kernel query. + SmallDenseMap KernelData; + +public: + using iterator = decltype(Kernels)::iterator; + using const_iterator = decltype(Kernels)::const_iterator; + + iterator begin() { return Kernels.begin(); } + iterator end() { return Kernels.end(); } + + const_iterator begin() const { return Kernels.begin(); } + const_iterator end() const { return Kernels.end(); } + + bool empty() const { return Kernels.empty(); } +}; -void populateKernels(Module &M, SmallVectorImpl &Kernels, - TargetHelpers::ArchType AT); +bool isSYCLDevice(const Module &M); } // end namespace TargetHelpers } // end namespace llvm diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp index d304ebf488488..3873b2f8837e5 100644 --- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp +++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp @@ -16,6 +16,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/SYCLLowerIR/TargetHelpers.h" #include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -72,13 +73,38 @@ static void getLoads(Instruction *P, SmallVectorImpl &Traversed, } } +static void validateKernels(Module &M, TargetHelpers::KernelCache &KCache) { + SmallVector Vec; + collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/false); + collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/true); + SmallPtrSet Used = {Vec.begin(), 
Vec.end()}; + + auto HasUseOtherThanLLVMUsed = [&Used](GlobalValue *GV) { + if (GV->use_empty()) + return false; + return !GV->hasOneUse() || !Used.count(GV); + }; + + for (auto &F : KCache) { + if (HasUseOtherThanLLVMUsed(F)) + llvm_unreachable("Kernel entry point can't have uses."); + } +} + // New PM implementation. PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { - AT = TargetHelpers::getArchType(M); + // Only run this pass on SYCL device code + if (!TargetHelpers::isSYCLDevice(M)) + return PreservedAnalyses::all(); + + // And only for NVPTX/AMDGCN targets. + Triple T(M.getTargetTriple()); + if (!T.isNVPTX() && !T.isAMDGCN()) + return PreservedAnalyses::all(); + Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName( - AT == ArchType::Cuda - ? static_cast(Intrinsic::nvvm_implicit_offset) - : static_cast(Intrinsic::amdgcn_implicit_offset))); + T.isNVPTX() ? static_cast(Intrinsic::nvvm_implicit_offset) + : static_cast(Intrinsic::amdgcn_implicit_offset))); if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) return PreservedAnalyses::all(); @@ -86,7 +112,7 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { if (EnableGlobalOffset) { // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is // happy with ADDRESS_SPACE_GENERIC (0). - TargetAS = AT == ArchType::Cuda ? 0 : 5; + TargetAS = T.isNVPTX() ? 0 : 5; /// The value for NVVM's adDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen /// to be 3, use it for the implicit argument pointer type. 
KernelImplicitArgumentType = @@ -97,14 +123,13 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { (ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) && "Implicit offset intrinsic does not return the expected type"); - SmallVector KernelPayloads; - TargetHelpers::populateKernels(M, KernelPayloads, AT); - - // Validate kernels and populate entry map - EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads); + TargetHelpers::KernelCache KCache; + KCache.populateKernels(M); + // Validate kernels + validateKernels(M, KCache); // Add implicit parameters to all direct and indirect users of the offset - addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr); + addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr, KCache); } SmallVector Worklist; SmallVector Loads; @@ -141,13 +166,11 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { return PreservedAnalyses::none(); } -void GlobalOffsetPass::processKernelEntryPoint(Function *Func) { - assert(EntryPointMetadata.count(Func) != 0 && - "Function must be an entry point"); - +void GlobalOffsetPass::processKernelEntryPoint( + Function *Func, TargetHelpers::KernelCache &KCache) { auto &M = *Func->getParent(); + Triple T(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); - MDNode *FuncMetadata = EntryPointMetadata[Func]; // Already processed. if (ProcessedFunctions.count(Func) == 1) @@ -155,32 +178,27 @@ void GlobalOffsetPass::processKernelEntryPoint(Function *Func) { // Add the new argument to all other kernel entry points, despite not // using the global offset. 
- auto *KernelMetadata = M.getNamedMetadata(getAnnotationString(AT).c_str()); - assert(KernelMetadata && "IR compiled must have correct annotations"); - auto *NewFunc = addOffsetArgumentToFunction( M, Func, KernelImplicitArgumentType->getPointerTo(), /*KeepOriginal=*/true, /*IsKernel=*/true) .first; + Argument *NewArgument = std::prev(NewFunc->arg_end()); // Pass byval to the kernel for NVIDIA, AMD's calling convention disallows // byval args, use byref. auto Attr = - AT == ArchType::Cuda + T.isNVPTX() ? Attribute::getWithByValType(Ctx, KernelImplicitArgumentType) : Attribute::getWithByRefType(Ctx, KernelImplicitArgumentType); NewArgument->addAttr(Attr); - // Add the metadata. - Metadata *NewMetadata[] = {ConstantAsMetadata::get(NewFunc), - FuncMetadata->getOperand(1), - FuncMetadata->getOperand(2)}; - KernelMetadata->addOperand(MDNode::get(Ctx, NewMetadata)); + KCache.handleNewCloneOf(*Func, *NewFunc, /*KernelOnly*/ true); } void GlobalOffsetPass::addImplicitParameterToCallers( - Module &M, Value *Callee, Function *CalleeWithImplicitParam) { + Module &M, Value *Callee, Function *CalleeWithImplicitParam, + TargetHelpers::KernelCache &KCache) { SmallVector Users{Callee->users()}; for (User *U : Users) { @@ -196,8 +214,8 @@ void GlobalOffsetPass::addImplicitParameterToCallers( continue; // Kernel entry points need additional processing and change Metdadata. - if (EntryPointMetadata.count(Caller) != 0) - processKernelEntryPoint(Caller); + if (KCache.isKernel(*Caller)) + processKernelEntryPoint(Caller, KCache); // Determine if `Caller` needs to be processed or if this is another // callsite from a non-offset function or an already-processed function. @@ -249,7 +267,7 @@ void GlobalOffsetPass::addImplicitParameterToCallers( continue; // Process callers of the old function. 
- addImplicitParameterToCallers(M, Caller, NewFunc); + addImplicitParameterToCallers(M, Caller, NewFunc, KCache); } } @@ -309,7 +327,7 @@ std::pair GlobalOffsetPass::addOffsetArgumentToFunction( // addrspace(3). This is done as kernels can't allocate and fill the // array in constant address space. // Not required any longer, but left due to deprecatedness. - if (IsKernel && AT == ArchType::AMDHSA) { + if (IsKernel && Func->getCallingConv() == CallingConv::AMDGPU_KERNEL) { BasicBlock *EntryBlock = &NewFunc->getEntryBlock(); IRBuilder<> Builder(EntryBlock, EntryBlock->getFirstInsertionPt()); Type *ImplicitOffsetType = @@ -324,7 +342,7 @@ std::pair GlobalOffsetPass::addOffsetArgumentToFunction( // are replaced with uses of kernarg.segment.ptr which is in // addrspace(4), cast implicit offset arg to constant memory so the // memcpy is issued into a correct address space. - auto OrigImplicitOffsetAS4 = Builder.CreateAddrSpaceCast( + auto *OrigImplicitOffsetAS4 = Builder.CreateAddrSpaceCast( OrigImplicitOffset, Type::getInt8Ty(M.getContext())->getPointerTo(4)); Builder.CreateMemCpy( ImplicitOffsetAlloca, ImplicitOffsetAlloca->getAlign(), @@ -381,28 +399,3 @@ std::pair GlobalOffsetPass::addOffsetArgumentToFunction( // Return the new function and the offset argument. 
return {NewFunc, ImplicitOffset}; } - -DenseMap GlobalOffsetPass::generateKernelMDNodeMap( - Module &M, SmallVectorImpl &KernelPayloads) { - SmallPtrSet Used; - SmallVector Vec; - collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/false); - collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/true); - Used = {Vec.begin(), Vec.end()}; - - auto HasUseOtherThanLLVMUsed = [&Used](GlobalValue *GV) { - if (GV->use_empty()) - return false; - return !GV->hasOneUse() || !Used.count(GV); - }; - - DenseMap EntryPointMetadata; - for (auto &KP : KernelPayloads) { - if (HasUseOtherThanLLVMUsed(KP.Kernel)) - llvm_unreachable("Kernel entry point can't have uses."); - - EntryPointMetadata[KP.Kernel] = KP.MD; - } - - return EntryPointMetadata; -} diff --git a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp index febf46177a6c4..220e04de41157 100644 --- a/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LocalAccessorToSharedMemory.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/IPO.h" using namespace llvm; @@ -53,31 +54,32 @@ ModulePass *llvm::createLocalAccessorToSharedMemoryPassLegacy() { // New PM implementation. PreservedAnalyses LocalAccessorToSharedMemoryPass::run(Module &M, ModuleAnalysisManager &) { - const auto AT = TargetHelpers::getArchType(M); + // Only run this pass on SYCL device code + if (!TargetHelpers::isSYCLDevice(M)) + return PreservedAnalyses::all(); - // Invariant: This pass is only intended to operate on SYCL kernels being - // compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa` - // triples. - if (ArchType::Unsupported == AT) + // And only for NVPTX/AMDGCN targets. 
+ Triple T(M.getTargetTriple()); + if (!T.isNVPTX() && !T.isAMDGCN()) return PreservedAnalyses::all(); - SmallVector Kernels; - TargetHelpers::populateKernels(M, Kernels, AT); - SmallVector, 4> NewToOldKernels; - if (Kernels.empty()) + TargetHelpers::KernelCache KCache; + KCache.populateKernels(M); + if (KCache.empty()) return PreservedAnalyses::all(); + DenseMap NewToOldKernels; // Process the function and if changed, update the metadata. - for (const auto &K : Kernels) { - auto *NewKernel = processKernel(M, K.Kernel); - if (NewKernel) - NewToOldKernels.push_back(std::make_pair(NewKernel, K)); + for (const auto &F : KCache) { + if (auto *NewKernel = processKernel(M, F)) + NewToOldKernels[NewKernel] = F; } if (NewToOldKernels.empty()) return PreservedAnalyses::all(); - postProcessKernels(NewToOldKernels); + for (auto &[NewF, F] : NewToOldKernels) + KCache.handleReplacedWith(*F, *NewF); return PreservedAnalyses::none(); } @@ -204,16 +206,3 @@ Function *LocalAccessorToSharedMemoryPass::processKernel(Module &M, return NF; } - -void LocalAccessorToSharedMemoryPass::postProcessKernels( - SmallVectorImpl> &NewToOldKernels) { - for (auto &Pair : NewToOldKernels) { - auto KP = std::get<1>(Pair); - auto *F = std::get<0>(Pair); - KP.MD->replaceOperandWith(0, llvm::ConstantAsMetadata::get(F)); - // The MD node of the kernel has been altered, make sure that all the - // dependent nodes are kept up to date. 
- for (MDNode *D : KP.DependentMDs) - D->replaceOperandWith(0, llvm::ConstantAsMetadata::get(F)); - } -} diff --git a/llvm/lib/SYCLLowerIR/TargetHelpers.cpp b/llvm/lib/SYCLLowerIR/TargetHelpers.cpp index 8c45148d181e4..2ec12616e6b33 100644 --- a/llvm/lib/SYCLLowerIR/TargetHelpers.cpp +++ b/llvm/lib/SYCLLowerIR/TargetHelpers.cpp @@ -13,105 +13,135 @@ #include "llvm/SYCLLowerIR/TargetHelpers.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/Debug.h" +#include "llvm/TargetParser/Triple.h" using namespace llvm; namespace llvm { namespace TargetHelpers { -KernelPayload::KernelPayload(Function *Kernel, MDNode *MD) - : Kernel(Kernel), MD(MD) {} +KernelCache::KernelPayload::KernelPayload(NamedMDNode *ModuleAnnotationsMD) + : ModuleAnnotationsMD(ModuleAnnotationsMD) {} -ArchType getArchType(const Module &M) { - return StringSwitch(M.getTargetTriple().c_str()) - .Case("nvptx64-nvidia-cuda", ArchType::Cuda) - .Case("nvptx-nvidia-cuda", ArchType::Cuda) - .Case("amdgcn-amd-amdhsa", ArchType::AMDHSA) - .Case("amdgcn--amdhsa", ArchType::AMDHSA) - .Default(ArchType::Unsupported); +bool KernelCache::isKernel(Function &F) const { + return KernelData.contains(&F); } -std::string getAnnotationString(ArchType AT) { - switch (AT) { - case TargetHelpers::ArchType::Cuda: - return std::string("nvvm.annotations"); - break; - case TargetHelpers::ArchType::AMDHSA: - return std::string("amdgcn.annotations"); - break; - default: - llvm_unreachable("Unsupported arch type."); +void KernelCache::handleReplacedWith(Function &OldF, Function &NewF) { + assert(KernelData.contains(&OldF) && "Unknown kernel"); + if (auto &KP = KernelData[&OldF]; KP.hasAnnotations()) { + // Make sure that all dependent annotation nodes are kept up to date. 
+ for (MDNode *D : KP.DependentMDs) + D->replaceOperandWith(0, ConstantAsMetadata::get(&NewF)); } - return std::string(); } -void populateKernels(Module &M, SmallVectorImpl &Kernels, - ArchType AT) { - // Access `{amdgcn|nvvm}.annotations` to determine which functions are kernel - // entry points. - std::string Annotation = getAnnotationString(AT); - auto *AnnotationMetadata = M.getNamedMetadata(Annotation); - // No kernels in the module, early exit. - if (!AnnotationMetadata) - return; - - SmallVector PossibleDependencies; - // It is possible that the annotations node contains multiple pointers to the - // same metadata, recognise visited ones. - SmallSet Visited; - for (auto *MetadataNode : AnnotationMetadata->operands()) { - if (Visited.contains(MetadataNode) || MetadataNode->getNumOperands() != 3) - continue; - - Visited.insert(MetadataNode); - - // Kernel entry points are identified using metadata nodes of the form: - // !X = !{, !"kernel", i32 1} - auto *Type = dyn_cast(MetadataNode->getOperand(1)); - if (!Type) - continue; - // Only process kernel entry points, - if (Type->getString() != "kernel") { - // but keep track of other nodes that point to the same function. - PossibleDependencies.push_back(MetadataNode); - continue; +void KernelCache::handleNewCloneOf(Function &OldF, Function &NewF, + bool KernelOnly) { + assert(KernelData.contains(&OldF) && "Unknown kernel"); + if (auto &KP = KernelData[&OldF]; KP.hasAnnotations()) { + if (KernelOnly) { + // We know this is a kernel, so add a single "kernel" annotation. + auto &Ctx = OldF.getContext(); + Metadata *NewKernelMD[] = { + ConstantAsMetadata::get(&NewF), MDString::get(Ctx, "kernel"), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))}; + KP.ModuleAnnotationsMD->addOperand(MDNode::get(Ctx, NewKernelMD)); + } else { + // Otherwise we'll need to clone all metadata, possibly dropping ones + // which we can't assume are safe to clone. 
+ llvm_unreachable("Unimplemented cloning logic"); } + } +} + +void KernelCache::populateKernels(Module &M) { + Triple T(M.getTargetTriple()); - // Get a pointer to the entry point function from the metadata. - const MDOperand &FuncOperand = MetadataNode->getOperand(0); - if (!FuncOperand) - continue; - if (auto *FuncConstant = dyn_cast(FuncOperand)) - if (auto *Func = dyn_cast(FuncConstant->getValue())) - Kernels.push_back(KernelPayload(Func, MetadataNode)); + // AMDGPU kernels are identified by their calling convention, and don't have + // any annotations. + if (T.isAMDGCN()) { + for (auto &F : M) { + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) { + Kernels.push_back(&F); + KernelData[&F] = KernelPayload{}; + } + } + return; } - // We need to match non-kernel metadata nodes using the kernel name to the - // kernel nodes. To avoid checking matched nodes multiple times keep track of - // handled entries. - SmallSet HandledNodes; - for (auto &KP : Kernels) { - auto *KernelConstant = cast(KP.MD->getOperand(0)); - auto KernelName = - cast(KernelConstant->getValue())->getFunction().getName(); - for (unsigned I = 0; I < PossibleDependencies.size(); ++I) { - if (HandledNodes.contains(I)) + // NVPTX kernels are identified by the global annotations metadata. + if (T.isNVPTX()) { + // Access `nvvm.annotations` to determine which functions are kernel + // entry points. + auto *AnnotationMetadata = M.getNamedMetadata("nvvm.annotations"); + // No kernels in the module, early exit. + if (!AnnotationMetadata) + return; + + // It is possible that the annotations node contains multiple pointers to + // the same metadata, recognise visited ones. 
+ SmallSet Visited; + DenseMap> DependentMDNodes; + + for (auto *MDN : AnnotationMetadata->operands()) { + if (Visited.contains(MDN) || MDN->getNumOperands() % 2 != 1) continue; - MDNode *Dep = PossibleDependencies[I]; - const MDOperand &FuncOperand = Dep->getOperand(0); + + Visited.insert(MDN); + + // Kernel entry points are identified using metadata nodes of the form: + // !X = !{[, !"kind", i32 X]+} + // Where "kind" == "kernel" and X == 1. + bool IsKernel = false; + for (size_t I = 1, E = MDN->getNumOperands() - 1; I < E && !IsKernel; + I += 2) { + if (auto *Type = dyn_cast(MDN->getOperand(I))) + if (Type->getString() == "kernel") { + if (auto *Val = + mdconst::dyn_extract(MDN->getOperand(I + 1))) + IsKernel = Val->getZExtValue() == 1; + } + } + + // Get a pointer to the entry point function from the metadata. + const MDOperand &FuncOperand = MDN->getOperand(0); if (!FuncOperand) continue; - if (auto *FuncConstant = dyn_cast(FuncOperand)) - if (auto *Func = dyn_cast(FuncConstant->getValue())) - // We've found a match, append the dependent node to the kernel - // payload and keep track of matched entries. - if (KernelName == Func->getFunction().getName()) { - KP.DependentMDs.push_back(Dep); - HandledNodes.insert(I); + + if (auto *FuncConstant = dyn_cast(FuncOperand)) { + if (auto *Func = dyn_cast(FuncConstant->getValue())) { + if (IsKernel && !KernelData.contains(Func)) { + Kernels.push_back(Func); + KernelData[Func] = KernelPayload{AnnotationMetadata}; } + DependentMDNodes[Func].push_back(MDN); + } + } } + + // We need to match non-kernel metadata nodes using the kernel name to the + // kernel nodes. To avoid checking matched nodes multiple times keep track + // of handled entries. 
+ SmallPtrSet<MDNode *, 4> HandledNodes; + for (auto &[F, KP] : KernelData) { + for (MDNode *DepMDN : DependentMDNodes[F]) { + if (HandledNodes.insert(DepMDN).second) + KP.DependentMDs.push_back(DepMDN); + } + } + } +} + +bool isSYCLDevice(const Module &M) { + if (auto *Flag = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("sycl-device"))) { + return Flag->getZExtValue() == 1; } + return false; } } // namespace TargetHelpers diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll b/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll index 27d23ced03eb7..3a55c50e8cb4c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-dbg.ll @@ -1,6 +1,5 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'simple_debug.bc' -source_filename = "global-offset-dbg.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" @@ -8,10 +7,8 @@ target triple = "amdgcn-amd-amdhsa" ; This test checks that debug information on functions and callsites are preserved declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: llvm.amdgcn.implicit.offset -define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { +define i64 @_ZTS14other_function() !dbg !11 { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 @@ -19,41 +16,70 @@ define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { ret i64 %4 } -; CHECK: weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %0) !dbg !14 { 
- -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() !dbg !14 { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel() !dbg !15 { +define amdgpu_kernel void @_ZTS14example_kernel() !dbg !14 { entry: %0 = call i64 @_ZTS14other_function(), !dbg !15 -; CHECK: %0 = call i64 @_ZTS14other_function(), !dbg !16 ret void } -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byref([3 x i32]) %0) !dbg !17 { -; CHECK: call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %1), !dbg !18 - !llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4} -!amdgcn.annotations = !{!5, !6, !7, !6, !8, !8, !8, !8, !9, !9, !8} +!llvm.module.flags = !{!3, !4, !5} !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 0.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) !1 = !DIFile(filename: "global-offset-debug.cpp", directory: "/tmp") !2 = !{} !3 = !{i32 2, !"Dwarf Version", i32 4} !4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!6 = !{i32 1, i32 4} -!7 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!8 = !{null, !"align", i32 16} -!9 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!5 = !{i32 1, !"sycl-device", i32 1} !11 = distinct !DISubprogram(name: "other_function", scope: !1, file: !1, line: 3, type: !12, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) !12 = !DISubroutineType(types: !13) !13 = !{null} !14 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) !15 = !DILocation(line: 1, column: 2, scope: !14) -; CHECK: !14 = distinct !DISubprogram(name: "other_function", scope: !1, file: !1, line: 3, type: !12, scopeLine: 3, flags: 
DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: !15 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: !16 = !DILocation(line: 1, column: 2, scope: !15) -; CHECK: !17 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: !18 = !DILocation(line: 1, column: 2, scope: !17) +; CHECK-LABEL: define i64 @_ZTS14other_function( +; CHECK-SAME: ) !dbg [[DBG5:![0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS14other_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) !dbg [[DBG8:![0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel( +; CHECK-SAME: ) !dbg [[DBG9:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14other_function(), !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel_with_offset( +; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) !dbg [[DBG11:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5), !dbg [[DBG12:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4), !dbg [[DBG12]] +; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false), !dbg [[DBG12]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 
@_ZTS14other_function_with_offset(ptr addrspace(5) [[TMP1]]), !dbg [[DBG12]] +; CHECK-NEXT: ret void +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "global-offset-debug.cpp", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Dwarf Version", i32 4} +; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; CHECK: [[DBG5]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META6:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]]) +; CHECK: [[META7]] = !{null} +; CHECK: [[DBG8]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META6]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG9]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META6]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG10]] = !DILocation(line: 1, column: 2, scope: [[DBG9]]) +; CHECK: [[DBG11]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META6]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG12]] = !DILocation(line: 1, column: 2, scope: [[DBG11]]) +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-intrinsic-function-mix.ll b/llvm/test/CodeGen/AMDGPU/global-offset-intrinsic-function-mix.ll index e80066dc5217a..b208dd2d9c32a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-intrinsic-function-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-intrinsic-function-mix.ll @@ -1,78 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'intrinsic-function-mix.bc' -source_filename = "global-offset-intrinsic-function-mix.ll" -target datalayout = -"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" ; This test checks that the pass works with functions containing calls to the ; intrinsic and calls to other functions that call the intrinsic declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: llvm.amdgcn.implicit.offset -define weak_odr dso_local i64 @_ZTS15other_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS15other_function() { +define i64 @_ZTS15other_function() { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 %4 = zext i32 %3 to i64 -; CHECK: %1 = zext i32 0 to i64 ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS15other_function_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: 
%3 = load i32, ptr addrspace(5) %2, align 4 -; CHECK: %4 = zext i32 %3 to i64 -; CHECK: ret i64 %4 -; CHECK: } - -define weak_odr dso_local i64 @_ZTS14mixed_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14mixed_function() { +define i64 @_ZTS14mixed_function() { %1 = call i64 @_ZTS15other_function() -; CHECK: %1 = call i64 @_ZTS15other_function() %2 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %3 = getelementptr inbounds i32, ptr addrspace(5) %2, i64 2 %4 = load i32, ptr addrspace(5) %3, align 4 %5 = zext i32 %4 to i64 -; CHECK: %2 = zext i32 0 to i64 ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS14mixed_function_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = call i64 @_ZTS15other_function_with_offset(ptr addrspace(5) %0) -; CHECK: %3 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: %4 = load i32, ptr addrspace(5) %3, align 4 -; CHECK: %5 = zext i32 %4 to i64 -; CHECK: ret i64 %2 - -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS12some_kernel() { +define amdgpu_kernel void @_ZTS12some_kernel() { entry: %0 = call i64 @_ZTS14mixed_function() -; CHECK: %0 = call i64 @_ZTS14mixed_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS12some_kernel_with_offset(ptr byref([3 x i32]) %0) { -; CHECK: entry: -; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) -; CHECK: %2 = addrspacecast ptr %0 to ptr addrspace(4) -; CHECK: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %1, ptr addrspace(4) align 1 %2, i64 12, i1 false) -; CHECK: %3 = call i64 @_ZTS14mixed_function_with_offset(ptr addrspace(5) %1) -; CHECK: ret void -; CHECK: } - -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS12some_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, 
!"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -!5 = !{i32 1, i32 4} -; CHECK: !6 = !{ptr @_ZTS12some_kernel_with_offset, !"kernel", i32 1} +!0 = !{i32 1, !"sycl-device", i32 1} +; CHECK-LABEL: define i64 @_ZTS15other_function() { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS15other_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +; +; CHECK-LABEL: define i64 @_ZTS14mixed_function() { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @_ZTS15other_function() +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS14mixed_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_ZTS15other_function_with_offset(ptr addrspace(5) [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: ret i64 [[TMP2]] +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS12some_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14mixed_function() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS12some_kernel_with_offset( +; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 
[[TMP2]], i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @_ZTS14mixed_function_with_offset(ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: ret void +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll b/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll index 9d58c72fce71c..d53887c0d9045 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-invalid-triple.ll @@ -1,15 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 +; This test checks that the pass does not run on invalid triples/cpus. ; RUN: not --crash llc -march=amdgcn -mcpu=hawaii %s -o - 2>&1 | FileCheck %s -; ModuleID = 'global-offset-invalid-triple.bc' + ; CHECK: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.implicit.offset -source_filename = "global-offset-invalid-triple.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" -; This test checks that the pass does not run on nvcl triples. 
- declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 @@ -17,17 +16,12 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ret i64 %4 } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() ret void } -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!0 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll index 872906459f8ef..a1942fd5bd38c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-calls-from-one-function.ll @@ -1,6 +1,5 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'global-offset-multiple-calls-from-one-function.bc' -source_filename = "global-offset-multiple-calls-from-one-function.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" @@ -12,64 +11,65 @@ 
target triple = "amdgcn-amd-amdhsa" ; all calls redirected to the corresponding variants. declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -define weak_odr dso_local i64 @_ZTS14other_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5)* @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 %4 = zext i32 %3 to i64 -; CHECK %1 = zext i32 0 to i64 - %5 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5)* @llvm.amdgcn.implicit.offset() %6 = getelementptr inbounds i32, ptr addrspace(5) %5, i64 2 %7 = load i32, ptr addrspace(5) %6, align 4 %8 = zext i32 %7 to i64 -; CHECK: %2 = zext i32 0 to i64 - ret i64 %4 -; CHECK: ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: %3 = load i32, ptr addrspace(5) %2, align 4 -; CHECK: %4 = zext i32 %3 to i64 -; CHECK: %5 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: %6 = load i32, ptr addrspace(5) %5, align 4 -; CHECK: %7 = zext i32 %6 to i64 -; CHECK: ret i64 %4 -; CHECK: } - -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define amdgpu_kernel void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() -; CHECK: %0 = call i64 @_ZTS14other_function() %1 = call i64 @_ZTS14other_function() -; CHECK: %1 = call i64 @_ZTS14other_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byref([3 x i32]) %0) { -; CHECK: entry: -; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) -; CHECK: %2 = addrspacecast ptr %0 to 
ptr addrspace(4) -; CHECK: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %1, ptr addrspace(4) align 1 %2, i64 12, i1 false) -; CHECK: %3 = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %1) -; CHECK: %4 = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %1) -; CHECK: ret void -; CHECK: } - -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} -; CHECK: !amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -; CHECK: !5 = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} +!0 = !{i32 1, !"sycl-device", i32 1} +; CHECK-LABEL: define i64 @_ZTS14other_function() { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS14other_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14other_function() +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @_ZTS14other_function() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel_with_offset( 
+; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: ret void +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll index 81cc05f5dc6a5..376779b76b09b 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-multiple-entry-points.ll @@ -1,6 +1,5 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'global-offset-multiple-entry-points.bc' -source_filename = "global-offset-multiple-entry-points.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" @@ -8,118 +7,148 @@ target triple = "amdgcn-amd-amdhsa" ; This test checks that the pass works with multiple entry points. 
declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() ; This function is a kernel entry point that does not use global offset. It will ; not get a clone with a global offset parameter. -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS12third_kernel() { +define amdgpu_kernel void @_ZTS12third_kernel() { entry: ret void } -define weak_odr dso_local i64 @_ZTS15common_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS15common_function() { +define i64 @_ZTS15common_function() { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 %4 = zext i32 %3 to i64 -; CHECK: %1 = zext i32 0 to i64 ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS15common_function_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: %3 = load i32, ptr addrspace(5) %2, align 4 -; CHECK: %4 = zext i32 %3 to i64 -; CHECK: ret i64 %4 -; CHECK: } -define weak_odr dso_local i64 @_ZTS14first_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14first_function() { +define i64 @_ZTS14first_function() { %1 = call i64 @_ZTS15common_function() -; CHECK: %1 = call i64 @_ZTS15common_function() ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS14first_function_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = call i64 @_ZTS15common_function_with_offset(ptr addrspace(5) %0) -; CHECK: ret i64 %2 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS12first_kernel() { +define amdgpu_kernel void @_ZTS12first_kernel() { entry: %0 = call i64 @_ZTS14first_function() -; CHECK: %0 = call i64 @_ZTS14first_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS12first_kernel_with_offset(ptr byref([3 x i32]) %0) { -; CHECK: entry: -; CHECK: 
%1 = alloca [3 x i32], align 4, addrspace(5) -; CHECK: %2 = addrspacecast ptr %0 to ptr addrspace(4) -; CHECK: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %1, ptr addrspace(4) align 1 %2, i64 12, i1 false) -; CHECK: %3 = call i64 @_ZTS14first_function_with_offset(ptr addrspace(5) %1) -; CHECK: ret void -; CHECK: } - -define weak_odr dso_local i64 @_ZTS15second_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS15second_function() { + +define i64 @_ZTS15second_function() { %1 = call i64 @_ZTS15common_function() -; CHECK: %1 = call i64 @_ZTS15common_function() ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS15second_function_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = call i64 @_ZTS15common_function_with_offset(ptr addrspace(5) %0) -; CHECK: ret i64 %2 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS13second_kernel() { +define amdgpu_kernel void @_ZTS13second_kernel() { entry: %0 = call i64 @_ZTS15second_function() -; CHECK: %0 = call i64 @_ZTS15second_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS13second_kernel_with_offset(ptr byref([3 x i32]) %0) { -; CHECK: entry: -; CHECK: %1 = alloca [3 x i32], align 4, addrspace(5) -; CHECK: %2 = addrspacecast ptr %0 to ptr addrspace(4) -; CHECK: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %1, ptr addrspace(4) align 1 %2, i64 12, i1 false) -; CHECK: %3 = call i64 @_ZTS15second_function_with_offset(ptr addrspace(5) %1) -; CHECK: ret void -; CHECK: } ; This function doesn't get called by a kernel entry point. 
-define weak_odr dso_local i64 @_ZTS15no_entry_point() { -; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point() { +define i64 @_ZTS15no_entry_point() { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 %4 = zext i32 %3 to i64 -; CHECK: %1 = zext i32 0 to i64 ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point_with_offset(ptr addrspace(5) %0) { -; CHECK: %2 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: %3 = load i32, ptr addrspace(5) %2, align 4 -; CHECK: %4 = zext i32 %3 to i64 -; CHECK: ret i64 %4 -; CHECK: } - -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6} -; CHECK: !amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6, !7, !8} - -!0 = distinct !{ptr @_ZTS12first_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -!5 = distinct !{ptr @_ZTS13second_kernel, !"kernel", i32 1} -!6 = distinct !{ptr @_ZTS12third_kernel, !"kernel", i32 1} -; CHECK: !7 = !{ptr @_ZTS13second_kernel_with_offset, !"kernel", i32 1} -; CHECK: !8 = !{ptr @_ZTS12first_kernel_with_offset, !"kernel", i32 1} + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"sycl-device", i32 1} +; CHECK-LABEL: define amdgpu_kernel void @_ZTS12third_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define i64 @_ZTS15common_function() { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS15common_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) 
[[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +; +; CHECK-LABEL: define i64 @_ZTS14first_function() { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @_ZTS15common_function() +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS14first_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_ZTS15common_function_with_offset(ptr addrspace(5) [[TMP0]]) +; CHECK-NEXT: ret i64 [[TMP2]] +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS12first_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14first_function() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS12first_kernel_with_offset( +; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @_ZTS14first_function_with_offset(ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define i64 @_ZTS15second_function() { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @_ZTS15common_function() +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS15second_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @_ZTS15common_function_with_offset(ptr addrspace(5) [[TMP0]]) +; CHECK-NEXT: ret i64 [[TMP2]] +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS13second_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS15second_function() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define amdgpu_kernel void 
@_ZTS13second_kernel_with_offset( +; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @_ZTS15second_function_with_offset(ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define i64 @_ZTS15no_entry_point() { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS15no_entry_point_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll index ccc06cf248a68..be7cbfed7c2d9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll @@ -1,20 +1,27 @@ -; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s ; This test checks that the implicit offset intrinsic is correctly removed declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: llvm.amdgcn.implicit.offset -define weak_odr dso_local i64 @_ZTS14example_kernel() { +define i64 @_ZTS14example_kernel() { entry: -; CHECK-NOT: @llvm.amdgcn.implicit.offset() -; CHECK-NOT: getelementptr -; CHECK-NOT: load -; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}} -; CHECK: ret i{{[0-9]+}} [[REG]] %0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1 %2 = load i32, ptr addrspace(5) %1, align 4 %3 = zext i32 %2 to i64 ret i64 %3 } + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"sycl-device", i32 1} +; CHECK-LABEL: define i64 @_ZTS14example_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP0]] +; +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll b/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll index 1e46f0dd6aeaf..7b66946d0a1d9 100644 --- a/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/global-offset-simple.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s ; ModuleID = 'global-offset-simple.bc' source_filename = "global-offset-simple.ll" @@ -8,48 +9,56 @@ target triple = "amdgcn-amd-amdhsa" ; This test checks that the transformation is applied in the basic case. declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: llvm.amdgcn.implicit.offset -define weak_odr dso_local i64 @_ZTS14other_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { %1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK-NOT: tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() %2 = getelementptr inbounds i32, ptr addrspace(5) %1, i64 2 %3 = load i32, ptr addrspace(5) %2, align 4 %4 = zext i32 %3 to i64 ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %0) { -; CHECK-NOT: tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() -; CHECK: %2 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 2 -; CHECK: %3 = load i32, ptr addrspace(5) %2, align 4 -; CHECK: %4 = zext i32 %3 to i64 -; CHECK: ret i64 %4 -; CHECK: } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define amdgpu_kernel void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() -; CHECK: %0 = call i64 @_ZTS14other_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byref([3 x i32]) %0) { -; CHECK: entry: -; CHECK: %1 = alloca [3 x 
i32], align 4, addrspace(5) -; CHECK: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %1, ptr addrspace(4) align 1 %2, i64 12, i1 false) -; CHECK: %3 = call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) %1) -; CHECK: ret void -; CHECK: } - -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} -; CHECK: !amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} - -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -; CHECK: !5 = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"sycl-device", i32 1} +; CHECK-LABEL: define i64 @_ZTS14other_function() { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS14other_function_with_offset( +; CHECK-SAME: ptr addrspace(5) [[TMP0:%.*]]) { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @_ZTS14other_function() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel_with_offset( +; CHECK-SAME: ptr byref([3 x i32]) [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = alloca [3 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(4) +; CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 [[TMP1]], ptr addrspace(4) align 1 [[TMP2]], i64 12, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = 
call i64 @_ZTS14other_function_with_offset(ptr addrspace(5) [[TMP1]]) +; CHECK-NEXT: ret void +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll index eb12cc16fa88a..3ccac8c2961a4 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-basic-transformation.ll @@ -1,32 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'basic-transformation.bc' -source_filename = "basic-transformation.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" ; This test checks that the transformation is applied in the basic case. -; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 -; Function Attrs: noinline +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 +;. 
define amdgpu_kernel void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define amdgpu_kernel void @_ZTS14example_kernel(i32 %0, ptr addrspace(1) %b, i32 %c) { +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP1]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; entry: -; CHECK: %1 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 %0 = load i32, ptr addrspace(3) %a -; CHECK: %2 = load i32, ptr addrspace(3) %a %1 = load i32, ptr addrspace(1) %b -; CHECK: %3 = load i32, ptr addrspace(1) %b %2 = add i32 %c, %c -; CHECK: %4 = add i32 %c, %c ret void } -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!0 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll index 60f0e1de6d01c..b4455a8738587 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-multiple-functions.ll @@ -1,39 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'multiple-functions.bc' -source_filename = "multiple-functions.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" ; This test checks that the transformation does not break kernels which call other functions. +;. ; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 - +;. 
define void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK-LABEL: define void @_ZTS14other_function( +; CHECK-SAME: ptr addrspace(3) [[A:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; %1 = load i32, ptr addrspace(3) %a -; CHECK: %1 = load i32, ptr addrspace(3) %a %2 = load i32, ptr addrspace(1) %b -; CHECK: %2 = load i32, ptr addrspace(1) %b %3 = add i32 %c, %c -; CHECK: %3 = add i32 %c, %c ret void } -; Function Attrs: noinline define amdgpu_kernel void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define amdgpu_kernel void @_ZTS14example_kernel(i32 %0, ptr addrspace(1) %b, i32 %c) { +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP1]] to ptr addrspace(3) +; CHECK-NEXT: call void @_ZTS14other_function(ptr addrspace(3) [[A]], ptr addrspace(1) [[B]], i32 [[C]]) +; CHECK-NEXT: ret void +; entry: -; CHECK: %1 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 - call void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) -; CHECK: call void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) + call void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ret void } -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.module.flags = !{!0} 
-!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!0 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll index 7d54793e469de..b6d645cbbf794 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-no-entry-points.ll @@ -1,27 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'no-entry-points.bc' -source_filename = "no-entry-points.ll" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" ; This test checks that no transformation is applied when there are no entry points. 
-; Function Attrs: noinline define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: ptr addrspace(3) [[A:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; entry: %0 = load i32, ptr addrspace(3) %a -; CHECK: %0 = load i32, ptr addrspace(3) %a %1 = load i32, ptr addrspace(1) %b -; CHECK: %1 = load i32, ptr addrspace(1) %b %2 = add i32 %c, %c -; CHECK: %2 = add i32 %c, %c ret void } -!amdgcn.annotations = !{!0, !1, !0, !2, !2, !2, !2, !3, !3, !2} +!llvm.module.flags = !{!0} -!0 = !{null, !"align", i32 8} -!1 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!2 = !{null, !"align", i32 16} -!3 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!0 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll index 74d8ab3995885..8d5a54f308779 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-preserves-types.ll @@ -1,37 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'bitcasts.bc' -source_filename = "bitcasts.ll" + target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" ; This test checks that the transformation always bitcasts to the correct type. ; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 - -; Function Attrs: noinline +;. define amdgpu_kernel void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { -; CHECK: define amdgpu_kernel void @_ZTS14example_kernel(i32 %0, i32 %1, i32 %2, i32 %3) { +; CHECK-LABEL: define amdgpu_kernel void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP3]] +; CHECK-NEXT: [[D:%.*]] = bitcast ptr addrspace(3) [[TMP4]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[C:%.*]] = bitcast ptr addrspace(3) [[TMP5]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[B:%.*]] = bitcast ptr addrspace(3) [[TMP6]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP7]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr addrspace(3) [[B]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(3) [[C]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr addrspace(3) [[D]], align 1 +; CHECK-NEXT: ret 
void +; entry: -; CHECK: %4 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %3 -; CHECK: %5 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %2 -; CHECK: %6 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %1 -; CHECK: %7 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 %0 = load i32, ptr addrspace(3) %a -; CHECK: %8 = load i32, ptr addrspace(3) %a %1 = load i64, ptr addrspace(3) %b -; CHECK: %9 = load i64, ptr addrspace(3) %b %2 = load i16, ptr addrspace(3) %c -; CHECK: %10 = load i16, ptr addrspace(3) %c %3 = load i8, ptr addrspace(3) %d -; CHECK: %11 = load i8, ptr addrspace(3) %d ret void } -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!0 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll index 1e094bbc91097..aae32968605ef 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-triple.ll @@ -3,8 +3,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-VALID %s ; RUN: llc -mtriple=amdgcn-amd-amdpal < %s | FileCheck --check-prefix=CHECK-INVALID %s -; ModuleID = 'local-accessor-to-shared-memory-triple.ll' -source_filename = "local-accessor-to-shared-memory-triple.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" @@ -17,17 +15,12 @@ target triple = "amdgcn-amd-amdhsa" ; CHECK-INVALID: amdpal.pipelines: ; CHECK-INVALID-NOT: - .args: -; Function Attrs: noinline define amdgpu_kernel void @_ZTS14example_kernel(ptr addrspace(3) %a) { entry: %0 = load i32, ptr addrspace(3) %a ret void } -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!0 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll index 3f00cd23d6475..0a85f85f317a6 100644 --- a/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll +++ b/llvm/test/CodeGen/AMDGPU/local-accessor-to-shared-memory-valid-triple.ll @@ -1,38 +1,21 @@ -; This test checks that the Local Accessor to Shared 
Memory pass runs with the -; `amdgcn-amd-amdhsa` triple and does not if the option is not present. -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-OPT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=CHECK-OPT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s -; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll' -source_filename = "local-accessor-to-shared-memory-valid-triple.ll" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -; CHECK-OPT: .globl _ZTS14example_kernel -; CHECK-OPT: - .args: -; CHECK-OPT-NOT: .address_space: local -; CHECK-OPT-NEXT: .offset: 0 -; CHECK-OPT-NEXT: .size: 4 -; CHECK-OPT-NEXT: .value_kind: by_value -; Function Attrs: noinline +; CHECK: .globl _ZTS14example_kernel +; CHECK: - .args: +; CHECK-NOT: .address_space: local +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 4 +; CHECK-NEXT: .value_kind: by_value + define amdgpu_kernel void @_ZTS14example_kernel(ptr addrspace(3) %a) { entry: %0 = load i32, ptr addrspace(3) %a ret void } -!amdgcn.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} -!llvm.ident = !{!7, !8} -!llvm.module.flags = !{!9, !10} +!llvm.module.flags = !{!0} -!0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -!1 = !{null, !"align", i32 8} -!2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} -!3 = !{null, !"align", i32 16} -!4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -!5 = !{i32 1, i32 2} -!6 = !{i32 4, i32 100000} -!7 = !{!"clang version 9.0.0"} -!8 = !{!"clang version 9.0.0"} -!9 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]} -!10 = !{i32 1, !"wchar_size", i32 4} +!0 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-annotations.ll 
b/llvm/test/CodeGen/NVPTX/global-offset-annotations.ll new file mode 100644 index 0000000000000..720528364bd3d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-annotations.ll @@ -0,0 +1,39 @@ +; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test checks that the transformation is applied to kernels found using +; less common annotation formats, and that annotations are correctly updated. +; We don't currently know it's safe to clone all metadata, so only add a +; "kernel" annotation and leave others in place. + +declare ptr @llvm.nvvm.implicit.offset() +; CHECK-NOT: llvm.nvvm.implicit.offset + +define i64 @_ZTS14other_function() { + %1 = tail call ptr @llvm.nvvm.implicit.offset() + %2 = getelementptr inbounds i32, ptr %1, i64 2 + %3 = load i32, ptr %2, align 4 + %4 = zext i32 %3 to i64 + ret i64 %4 +} + +define void @_ZTS14example_kernel() { +entry: + %0 = call i64 @_ZTS14other_function() + ret void +} + +; CHECK: !nvvm.annotations = !{![[OLDMD0:[0-9]+]], ![[OLDMD1:[0-9]+]], ![[OLDMD1]], ![[OLDMD0]], ![[NEWKERNELMD:[0-9]+]]} + +!llvm.module.flags = !{!0} +!nvvm.annotations = !{!1, !2, !2, !1} + +; CHECK: ![[OLDMD0]] = distinct !{ptr @_ZTS14example_kernel, !"maxnreg", i32 256, !"kernel", i32 1} +; CHECK: ![[OLDMD1]] = !{ptr @_ZTS14example_kernel, !"maxntidx", i32 8, !"maxntidy", i32 16, !"maxntidz", i32 32} +; CHECK: ![[NEWKERNELMD]] = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} + +!0 = !{i32 1, !"sycl-device", i32 1} +!1 = distinct !{ptr @_ZTS14example_kernel, !"maxnreg", i32 256, !"kernel", i32 1} +!2 = !{ptr @_ZTS14example_kernel, !"maxntidx", i32 8, !"maxntidy", i32 16, !"maxntidz", i32 32} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-dbg.ll b/llvm/test/CodeGen/NVPTX/global-offset-dbg.ll index 160631ccc1fe7..20bc0b1920dcb 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-dbg.ll +++ 
b/llvm/test/CodeGen/NVPTX/global-offset-dbg.ll @@ -1,16 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'simple_debug.bc' -source_filename = "simple_debug.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; This test checks that debug information on functions and callsites are preserved declare ptr @llvm.nvvm.implicit.offset() -; CHECK-NOT: llvm.nvvm.implicit.offset -define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { +define i64 @_ZTS14other_function() !dbg !11 { %1 = tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 %3 = load i32, ptr %2, align 4 @@ -18,22 +16,14 @@ define weak_odr dso_local i64 @_ZTS14other_function() !dbg !11 { ret i64 %4 } -; CHECK: weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr %0) !dbg !14 { - -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() !dbg !14 { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel() !dbg !15 { +define void @_ZTS14example_kernel() !dbg !14 { entry: %0 = call i64 @_ZTS14other_function(), !dbg !15 -; CHECK: %0 = call i64 @_ZTS14other_function(), !dbg !16 ret void } -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byval([3 x i32]) %0) !dbg !17 { -; CHECK: %1 = call i64 @_ZTS14other_function_with_offset(ptr %0), !dbg !18 - !llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4} +!llvm.module.flags = !{!3, !4, !10} !nvvm.annotations = !{!5, !6, !7, !6, !8, !8, !8, !8, !9, !9, !8} !nvvmir.version = !{!6} @@ -47,13 +37,58 @@ entry: !7 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !8 = !{null, !"align", i32 16} !9 = !{null, !"align", i32 16, 
!"align", i32 65552, !"align", i32 131088} +!10 = !{i32 1, !"sycl-device", i32 1} !11 = distinct !DISubprogram(name: "other_function", scope: !1, file: !1, line: 3, type: !12, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) !12 = !DISubroutineType(types: !13) !13 = !{null} !14 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) !15 = !DILocation(line: 1, column: 2, scope: !14) -; CHECK: !14 = distinct !DISubprogram(name: "other_function", scope: !1, file: !1, line: 3, type: !12, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: !15 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: !16 = !DILocation(line: 1, column: 2, scope: !15) -; CHECK: !17 = distinct !DISubprogram(name: "example_kernel", scope: !1, file: !1, line: 10, type: !12, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: !18 = !DILocation(line: 1, column: 2, scope: !17) +; CHECK-LABEL: define i64 @_ZTS14other_function( +; CHECK-SAME: ) !dbg [[DBG12:![0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: ret i64 [[TMP1]] +; +; +; CHECK-LABEL: define i64 @_ZTS14other_function_with_offset( +; CHECK-SAME: ptr [[TMP0:%.*]]) !dbg [[DBG15:![0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: ret i64 [[TMP4]] +; +; +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: ) !dbg [[DBG16:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 
@_ZTS14other_function(), !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define void @_ZTS14example_kernel_with_offset( +; CHECK-SAME: ptr byval([3 x i32]) [[TMP0:%.*]]) !dbg [[DBG18:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @_ZTS14other_function_with_offset(ptr [[TMP0]]), !dbg [[DBG19:![0-9]+]] +; CHECK-NEXT: ret void +; +;. +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "global-offset-debug.cpp", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META3:![0-9]+]] = !{i32 2, !"Dwarf Version", i32 4} +; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; CHECK: [[META5:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META6:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: [[META7:![0-9]+]] = !{i32 1, i32 4} +; CHECK: [[META8:![0-9]+]] = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +; CHECK: [[META9:![0-9]+]] = !{null, !"align", i32 16} +; CHECK: [[META10:![0-9]+]] = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: [[META11:![0-9]+]] = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} +; CHECK: [[DBG12]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META13:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META13]] = !DISubroutineType(types: [[META14:![0-9]+]]) +; CHECK: [[META14]] = !{null} +; CHECK: [[DBG15]] = distinct !DISubprogram(name: "other_function", scope: [[META1]], file: [[META1]], line: 3, type: [[META13]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], 
retainedNodes: [[META2]]) +; CHECK: [[DBG16]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META13]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG17]] = !DILocation(line: 1, column: 2, scope: [[DBG16]]) +; CHECK: [[DBG18]] = distinct !DISubprogram(name: "example_kernel", scope: [[META1]], file: [[META1]], line: 10, type: [[META13]], scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG19]] = !DILocation(line: 1, column: 2, scope: [[DBG18]]) +;. diff --git a/llvm/test/CodeGen/NVPTX/global-offset-intrinsic-function-mix.ll b/llvm/test/CodeGen/NVPTX/global-offset-intrinsic-function-mix.ll index 0c771f02b5254..9400224f38286 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-intrinsic-function-mix.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-intrinsic-function-mix.ll @@ -1,6 +1,5 @@ ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'intrinsic-function-mix.bc' -source_filename = "global-offset-intrinsic-function-mix.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -10,8 +9,8 @@ target triple = "nvptx64-nvidia-cuda" declare ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: declare ptr @llvm.nvvm.implicit.offset() -define weak_odr dso_local i64 @_ZTS15other_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS15other_function() { +define i64 @_ZTS15other_function() { +; CHECK: define i64 @_ZTS15other_function() { %1 = tail call ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 @@ -21,15 +20,15 @@ define weak_odr dso_local i64 @_ZTS15other_function() { ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS15other_function_with_offset(ptr %0) { +; CHECK: define i64 
@_ZTS15other_function_with_offset(ptr %0) { ; CHECK: %2 = getelementptr inbounds i32, ptr %0, i64 2 ; CHECK: %3 = load i32, ptr %2, align 4 ; CHECK: %4 = zext i32 %3 to i64 ; CHECK: ret i64 %4 ; CHECK: } -define weak_odr dso_local i64 @_ZTS14mixed_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14mixed_function() { +define i64 @_ZTS14mixed_function() { +; CHECK: define i64 @_ZTS14mixed_function() { %1 = call i64 @_ZTS15other_function() ; CHECK: %1 = call i64 @_ZTS15other_function() %2 = tail call ptr @llvm.nvvm.implicit.offset() @@ -41,34 +40,38 @@ define weak_odr dso_local i64 @_ZTS14mixed_function() { ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS14mixed_function_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS14mixed_function_with_offset(ptr %0) { ; CHECK: %2 = call i64 @_ZTS15other_function_with_offset(ptr %0) ; CHECK: %3 = getelementptr inbounds i32, ptr %0, i64 2 ; CHECK: %4 = load i32, ptr %3, align 4 ; CHECK: %5 = zext i32 %4 to i64 ; CHECK: ret i64 %2 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS12some_kernel() { +define void @_ZTS12some_kernel() { entry: %0 = call i64 @_ZTS14mixed_function() ; CHECK: %0 = call i64 @_ZTS14mixed_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS12some_kernel_with_offset(ptr byval([3 x i32]) %0) { +; CHECK: define void @_ZTS12some_kernel_with_offset(ptr byval([3 x i32]) %0) { ; CHECK: entry: ; CHECK: %1 = call i64 @_ZTS14mixed_function_with_offset(ptr %0) ; CHECK: ret void ; CHECK: } +; Check the last annotation is our new kernel +; CHECK: !nvvm.annotations = {{.*}}, ![[NEWKERNELMD:[0-9]+]]} + +!llvm.module.flags = !{!6} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} !nvvmir.version = !{!5} +; CHECK: ![[NEWKERNELMD]] = !{ptr @_ZTS12some_kernel_with_offset, !"kernel", i32 1} !0 = distinct !{ptr @_ZTS12some_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = 
!{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = !{i32 1, i32 4} -; CHECK: !6 = !{ptr @_ZTS12some_kernel_with_offset, !"kernel", i32 1} +!6 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll index 753887300dd18..43b1e157ed5c7 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-invalid-triple.ll @@ -9,7 +9,7 @@ target triple = "nvptx64-nvidia-nvcl" declare ptr @llvm.nvvm.implicit.offset() -define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { %1 = tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 %3 = load i32, ptr %2, align 4 @@ -17,13 +17,13 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ret i64 %4 } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() ret void } +!llvm.module.flags = !{!6} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !nvvmir.version = !{!5} @@ -33,3 +33,4 @@ entry: !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = !{i32 1, i32 4} +!6 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll index f666cdde2c5bc..f44836f337767 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-calls-from-one-function.ll @@ -13,8 +13,8 @@ target triple = "nvptx64-nvidia-cuda" declare ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: declare ptr @llvm.nvvm.implicit.offset() -define weak_odr dso_local i64 @_ZTS14other_function() { -; CHECK: 
define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { +; CHECK: define i64 @_ZTS14other_function() { %1 = tail call ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 @@ -33,7 +33,7 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ; CHECK: ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS14other_function_with_offset(ptr %0) { ; CHECK: %2 = getelementptr inbounds i32, ptr %0, i64 2 ; CHECK: %3 = load i32, ptr %2, align 4 ; CHECK: %4 = zext i32 %3 to i64 @@ -43,8 +43,7 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ; CHECK: ret i64 %4 ; CHECK: } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() ; CHECK: %0 = call i64 @_ZTS14other_function() @@ -53,21 +52,25 @@ entry: ret void } -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byval([3 x i32]) %0) { +; CHECK: define void @_ZTS14example_kernel_with_offset(ptr byval([3 x i32]) %0) { ; CHECK: entry: ; CHECK: %1 = call i64 @_ZTS14other_function_with_offset(ptr %0) ; CHECK: %2 = call i64 @_ZTS14other_function_with_offset(ptr %0) ; CHECK: ret void ; CHECK: } +; Check the last annotation is our new kernel +; CHECK: !nvvm.annotations = {{.*}}, ![[NEWKERNELMD:[0-9]+]]} + +!llvm.module.flags = !{!7} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} -; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} !nvvmir.version = !{!6} +; CHECK: ![[NEWKERNELMD]] = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = !{null, !"align", i32 16} !4 = !{null, 
!"align", i32 16, !"align", i32 65552, !"align", i32 131088} -; CHECK: !5 = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} !6 = !{i32 1, i32 4} +!7 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll index bddcb7204778c..41c05075eb3db 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-multiple-entry-points.ll @@ -11,14 +11,13 @@ declare ptr @llvm.nvvm.implicit.offset() ; This function is a kernel entry point that does not use global offset. It will ; not get a clone with a global offset parameter. -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS12third_kernel() { +define dso_local void @_ZTS12third_kernel() { entry: ret void } -define weak_odr dso_local i64 @_ZTS15common_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS15common_function() { +define i64 @_ZTS15common_function() { +; CHECK: define i64 @_ZTS15common_function() { %1 = tail call ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 @@ -28,66 +27,64 @@ define weak_odr dso_local i64 @_ZTS15common_function() { ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS15common_function_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS15common_function_with_offset(ptr %0) { ; CHECK: %2 = getelementptr inbounds i32, ptr %0, i64 2 ; CHECK: %3 = load i32, ptr %2, align 4 ; CHECK: %4 = zext i32 %3 to i64 ; CHECK: ret i64 %4 ; CHECK: } -define weak_odr dso_local i64 @_ZTS14first_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14first_function() { +define i64 @_ZTS14first_function() { +; CHECK: define i64 @_ZTS14first_function() { %1 = call i64 @_ZTS15common_function() ; CHECK: %1 = call i64 @_ZTS15common_function() ret i64 %1 } -; CHECK: define weak_odr dso_local i64 
@_ZTS14first_function_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS14first_function_with_offset(ptr %0) { ; CHECK: %2 = call i64 @_ZTS15common_function_with_offset(ptr %0) ; CHECK: ret i64 %2 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS12first_kernel() { +define void @_ZTS12first_kernel() { entry: %0 = call i64 @_ZTS14first_function() ; CHECK: %0 = call i64 @_ZTS14first_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS12first_kernel_with_offset(ptr byval([3 x i32]) %0) { +; CHECK: define void @_ZTS12first_kernel_with_offset(ptr byval([3 x i32]) %0) { ; CHECK: entry: ; CHECK: %1 = call i64 @_ZTS14first_function_with_offset(ptr %0) ; CHECK: ret void ; CHECK: } -define weak_odr dso_local i64 @_ZTS15second_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS15second_function() { +define i64 @_ZTS15second_function() { +; CHECK: define i64 @_ZTS15second_function() { %1 = call i64 @_ZTS15common_function() ; CHECK: %1 = call i64 @_ZTS15common_function() ret i64 %1 } -; CHECK: define weak_odr dso_local i64 @_ZTS15second_function_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS15second_function_with_offset(ptr %0) { ; CHECK: %2 = call i64 @_ZTS15common_function_with_offset(ptr %0) ; CHECK: ret i64 %2 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS13second_kernel() { +define void @_ZTS13second_kernel() { entry: %0 = call i64 @_ZTS15second_function() ; CHECK: %0 = call i64 @_ZTS15second_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS13second_kernel_with_offset(ptr byval([3 x i32]) %0) { +; CHECK: define void @_ZTS13second_kernel_with_offset(ptr byval([3 x i32]) %0) { ; CHECK: entry: ; CHECK: %1 = call i64 @_ZTS15second_function_with_offset(ptr %0) ; CHECK: ret void ; CHECK: } ; This function doesn't get called by a kernel entry point. 
-define weak_odr dso_local i64 @_ZTS15no_entry_point() { -; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point() { +define i64 @_ZTS15no_entry_point() { +; CHECK: define i64 @_ZTS15no_entry_point() { %1 = tail call ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 @@ -97,17 +94,23 @@ define weak_odr dso_local i64 @_ZTS15no_entry_point() { ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS15no_entry_point_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS15no_entry_point_with_offset(ptr %0) { ; CHECK: %2 = getelementptr inbounds i32, ptr %0, i64 2 ; CHECK: %3 = load i32, ptr %2, align 4 ; CHECK: %4 = zext i32 %3 to i64 ; CHECK: ret i64 %4 ; CHECK: } +; Check the last two annotations are our new kernels +; CHECK: !nvvm.annotations = {{.*}}, ![[NEWKERNEL0MD:[0-9]+]], ![[NEWKERNEL1MD:[0-9]+]]} + +!llvm.module.flags = !{!10} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6} -; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5, !6, !7, !8} !nvvmir.version = !{!9} +; CHECK: ![[NEWKERNEL0MD]] = !{ptr @_ZTS13second_kernel_with_offset, !"kernel", i32 1} +; CHECK: ![[NEWKERNEL1MD]] = !{ptr @_ZTS12first_kernel_with_offset, !"kernel", i32 1} + !0 = distinct !{ptr @_ZTS12first_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} @@ -115,6 +118,5 @@ define weak_odr dso_local i64 @_ZTS15no_entry_point() { !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = distinct !{ptr @_ZTS13second_kernel, !"kernel", i32 1} !6 = distinct !{ptr @_ZTS12third_kernel, !"kernel", i32 1} -; CHECK: !7 = !{ptr @_ZTS13second_kernel_with_offset, !"kernel", i32 1} -; CHECK: !8 = !{ptr @_ZTS12first_kernel_with_offset, !"kernel", i32 1} !9 = !{i32 1, i32 4} +!10 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll 
b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll index da116feede474..53651fff81139 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll @@ -6,7 +6,7 @@ target triple = "nvptx64-nvidia-cuda" declare ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: llvm.nvvm.implicit.offset -define weak_odr dso_local i64 @_ZTS14example_kernel() { +define i64 @_ZTS14example_kernel() { entry: ; CHECK-NOT: @llvm.nvvm.implicit.offset() ; CHECK-NOT: getelementptr @@ -19,3 +19,6 @@ entry: %3 = zext i32 %2 to i64 ret i64 %3 } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-simple.ll b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll index 8b6546ceece5e..2e1b41e94a25d 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-simple.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-simple.ll @@ -1,6 +1,5 @@ ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s -; ModuleID = 'simple.bc' -source_filename = "simple.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -9,8 +8,8 @@ target triple = "nvptx64-nvidia-cuda" declare ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: llvm.nvvm.implicit.offset -define weak_odr dso_local i64 @_ZTS14other_function() { -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { +; CHECK: define i64 @_ZTS14other_function() { %1 = tail call ptr @llvm.nvvm.implicit.offset() ; CHECK-NOT: tail call ptr @llvm.nvvm.implicit.offset() %2 = getelementptr inbounds i32, ptr %1, i64 2 @@ -19,7 +18,7 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ret i64 %4 } -; CHECK: define weak_odr dso_local i64 @_ZTS14other_function_with_offset(ptr %0) { +; CHECK: define i64 @_ZTS14other_function_with_offset(ptr %0) { ; CHECK-NOT: tail call ptr @llvm.nvvm.implicit.offset() ; CHECK: %2 = getelementptr inbounds i32, ptr %0, i64 2 ; 
CHECK: %3 = load i32, ptr %2, align 4 @@ -27,28 +26,31 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ; CHECK: ret i64 %4 ; CHECK: } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() ; CHECK: %0 = call i64 @_ZTS14other_function() ret void } -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel_with_offset(ptr byval([3 x i32]) %0) { +; CHECK: define void @_ZTS14example_kernel_with_offset(ptr byval([3 x i32]) %0) { ; CHECK: entry: ; CHECK: %1 = call i64 @_ZTS14other_function_with_offset(ptr %0) ; CHECK: ret void ; CHECK: } +; Check the last annotation is our new kernel +; CHECK: !nvvm.annotations = {{.*}}, ![[NEWKERNELMD:[0-9]+]]} + +!llvm.module.flags = !{!7} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} -; CHECK: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} !nvvmir.version = !{!6} +; CHECK: ![[NEWKERNELMD]] = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -; CHECK: !5 = !{ptr @_ZTS14example_kernel_with_offset, !"kernel", i32 1} !6 = !{i32 1, i32 4} +!7 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll index 5b5baa19e787f..3315416bf8985 100644 --- a/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/global-offset-valid-triple.ll @@ -9,7 +9,7 @@ target triple = "nvptx64-nvidia-cuda" declare ptr @llvm.nvvm.implicit.offset() -define weak_odr dso_local i64 @_ZTS14other_function() { +define i64 @_ZTS14other_function() { %1 = tail call ptr @llvm.nvvm.implicit.offset() %2 = 
getelementptr inbounds i32, ptr %1, i64 2 %3 = load i32, ptr %2, align 4 @@ -17,8 +17,7 @@ define weak_odr dso_local i64 @_ZTS14other_function() { ret i64 %4 } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel() { +define void @_ZTS14example_kernel() { entry: %0 = call i64 @_ZTS14other_function() ret void @@ -27,7 +26,7 @@ entry: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !llvm.ident = !{!7, !8} !nvvmir.version = !{!9} -!llvm.module.flags = !{!10, !11} +!llvm.module.flags = !{!10, !11, !12} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} @@ -41,3 +40,4 @@ entry: !9 = !{i32 1, i32 4} !10 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]} !11 = !{i32 1, !"wchar_size", i32 4} +!12 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-annotations.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-annotations.ll new file mode 100644 index 0000000000000..88bc5481ae3e1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-annotations.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 +; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; This test checks that the transformation is applied to kernels found using +; less common annotation formats, and that annotations are correctly updated. 
+ +define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +entry: + %0 = load i32, ptr addrspace(3) %a + %1 = load i32, ptr addrspace(1) %b + %2 = add i32 %c, %c + ret void +} + +!llvm.module.flags = !{!0} +!nvvm.annotations = !{!1, !2, !3, !4} + +!0 = !{i32 1, !"sycl-device", i32 1} +!1 = distinct !{ptr @_ZTS14example_kernel, !"maxntidx", i32 256, !"kernel", i32 1, !"maxntidy", i32 64} +!2 = !{ptr @_ZTS14example_kernel, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!3 = !{ptr @_ZTS14example_kernel, !"maxntidz", i32 256} +!4 = !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +;. +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 +;. +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP1]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META1:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"maxntidx", i32 256, !"kernel", i32 1, !"maxntidy", i32 64} +; CHECK: [[META2:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +; CHECK: [[META3:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"maxntidz", i32 256} +; CHECK: [[META4:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +;. 
diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll index 8ff198d387dad..e655a83c2da55 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-basic-transformation.ll @@ -1,6 +1,6 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'basic-transformation.bc' -source_filename = "basic-transformation.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -8,31 +8,47 @@ target triple = "nvptx64-nvidia-cuda" ; also makes sure that a non-kernel node using the function's signature gets ; correctly updated (`maxntid`). -; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel(i32 %0, ptr addrspace(1) %b, i32 %c) { +define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { entry: -; CHECK: %1 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 %0 = load i32, ptr addrspace(3) %a -; CHECK: %2 = load i32, ptr addrspace(3) %a %1 = load i32, ptr addrspace(1) %b -; CHECK: %3 = load i32, ptr addrspace(1) %b %2 = add i32 %c, %c -; CHECK: %4 = add i32 %c, %c ret void } +!llvm.module.flags = !{!7} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3, !5} !nvvmir.version = !{!6} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", 
i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} -; CHECK: !5 = distinct !{ptr @_ZTS14example_kernel, !"maxntidx", i32 256} !5 = !{ptr @_ZTS14example_kernel, !"maxntidx", i32 256} !6 = !{i32 1, i32 4} +!7 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 +;. +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP1]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META1:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: [[META2:![0-9]+]] = !{null, !"align", i32 8} +; CHECK: [[META3:![0-9]+]] = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +; CHECK: [[META4:![0-9]+]] = !{null, !"align", i32 16} +; CHECK: [[META5:![0-9]+]] = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: [[META6:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"maxntidx", i32 256} +; CHECK: [[META7:![0-9]+]] = !{i32 1, i32 4} +;. 
diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll index a3fad590360b8..083dbf49ab1ba 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-invalid-triple.ll @@ -1,15 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; This test checks that the Local Accessor to Shared Memory pass does not run with the ; `nvptx64-nvidia-nvcl` triple. ; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck %s ; CHECK: .param .u64 .ptr .shared .align 1 _ZTS14example_kernel_param_0 -; ModuleID = 'local-accessor-to-shared-memory-invalid-triple.ll' -source_filename = "local-accessor-to-shared-memory-invalid-triple.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-nvcl" -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a) { +define void @_ZTS14example_kernel(ptr addrspace(3) %a) { entry: ret void } @@ -17,7 +15,7 @@ entry: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !llvm.ident = !{!7, !8} !nvvmir.version = !{!9} -!llvm.module.flags = !{!10, !11} +!llvm.module.flags = !{!10, !11, !12} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} @@ -31,3 +29,4 @@ entry: !9 = !{i32 1, i32 4} !10 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]} !11 = !{i32 1, !"wchar_size", i32 4} +!12 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll index 638d87991c9c0..48b4fee40a99c 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll +++ 
b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-annotations.ll @@ -1,34 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'multiple-annotations.bc' -source_filename = "multiple-annotations.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; This test checks that the transformation is applied in the basic case with multiple identical annotations nodes. -; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel(i32 %0, ptr addrspace(1) %b, i32 %c) { +define dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { entry: -; CHECK: %1 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 %0 = load i32, ptr addrspace(3) %a -; CHECK: %2 = load i32, ptr addrspace(3) %a %1 = load i32, ptr addrspace(1) %b -; CHECK: %3 = load i32, ptr addrspace(1) %b %2 = add i32 %c, %c -; CHECK: %4 = add i32 %c, %c ret void } +!llvm.module.flags = !{!6} !nvvm.annotations = !{!0, !0} !nvvmir.version = !{!5} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = !{i32 1, i32 4} +!6 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 +;. 
+; CHECK-LABEL: define dso_local void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP1]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META1:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: [[META2:![0-9]+]] = !{i32 1, i32 4} +;. diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll index ead341c3bfe72..c361cd930d7da 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-multiple-functions.ll @@ -1,42 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'multiple-functions.bc' -source_filename = "multiple-functions.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; This test checks that the transformation does not break kernels which call other functions. 
-; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 -define weak_odr dso_local void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define weak_odr dso_local void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +define void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { %1 = load i32, ptr addrspace(3) %a -; CHECK: %1 = load i32, ptr addrspace(3) %a %2 = load i32, ptr addrspace(1) %b -; CHECK: %2 = load i32, ptr addrspace(1) %b %3 = add i32 %c, %c -; CHECK: %3 = add i32 %c, %c ret void } -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel(i32 %0, ptr addrspace(1) %b, i32 %c) { +define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { entry: -; CHECK: %1 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 -; CHECK: %a = bitcast ptr addrspace(3) %1 to ptr addrspace(3) call void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) -; CHECK: call void @_ZTS14other_function(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) ret void } +!llvm.module.flags = !{!6} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !nvvmir.version = !{!5} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = !{i32 1, i32 4} +!6 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 +;. 
+; CHECK-LABEL: define void @_ZTS14other_function( +; CHECK-SAME: ptr addrspace(3) [[A:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP1]] to ptr addrspace(3) +; CHECK-NEXT: call void @_ZTS14other_function(ptr addrspace(3) [[A]], ptr addrspace(1) [[B]], i32 [[C]]) +; CHECK-NEXT: ret void +; +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META1:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: [[META2:![0-9]+]] = !{null, !"align", i32 8} +; CHECK: [[META3:![0-9]+]] = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +; CHECK: [[META4:![0-9]+]] = !{null, !"align", i32 16} +; CHECK: [[META5:![0-9]+]] = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: [[META6:![0-9]+]] = !{i32 1, i32 4} +;. 
diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll index 6af963d51869f..5dd3eb24e4aad 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-no-entry-points.ll @@ -1,24 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'no-entry-points.bc' -source_filename = "no-entry-points.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; This test checks that no transformation is applied when there are no entry points. -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) { +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: ptr addrspace(3) [[A:%.*]], ptr addrspace(1) [[B:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[B]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[C]], [[C]] +; CHECK-NEXT: ret void +; entry: %0 = load i32, ptr addrspace(3) %a -; CHECK: %0 = load i32, ptr addrspace(3) %a %1 = load i32, ptr addrspace(1) %b -; CHECK: %1 = load i32, ptr addrspace(1) %b %2 = add i32 %c, %c -; CHECK: %2 = add i32 %c, %c ret void } +!llvm.module.flags = !{!5} !nvvm.annotations = !{!0, !1, !0, !2, !2, !2, !2, !3, !3, !2} !nvvmir.version = !{!4} @@ -27,3 +31,12 @@ entry: !2 = !{null, !"align", i32 16} !3 = !{null, 
!"align", i32 16, !"align", i32 65552, !"align", i32 131088} !4 = !{i32 1, i32 4} +!5 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META1:![0-9]+]] = !{null, !"align", i32 8} +; CHECK: [[META2:![0-9]+]] = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +; CHECK: [[META3:![0-9]+]] = !{null, !"align", i32 16} +; CHECK: [[META4:![0-9]+]] = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: [[META5:![0-9]+]] = !{i32 1, i32 4} +;. diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll index 9dd020c2fb657..61bb349ee57ae 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-preserves-types.ll @@ -1,43 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -bugpoint-enable-legacy-pm -localaccessortosharedmemory %s -S -o - | FileCheck %s -; ModuleID = 'bitcasts.bc' -source_filename = "bitcasts.ll" + target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" ; This test checks that the transformation always bitcasts to the correct type. -; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { -; CHECK: define weak_odr dso_local void @_ZTS14example_kernel(i32 %0, i32 %1, i32 %2, i32 %3) { +;. +; CHECK: @_ZTS14example_kernel_shared_mem = external addrspace(3) global [0 x i8], align 4 +;. 
+define void @_ZTS14example_kernel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { +; CHECK-LABEL: define void @_ZTS14example_kernel( +; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP3]] +; CHECK-NEXT: [[D:%.*]] = bitcast ptr addrspace(3) [[TMP4]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[C:%.*]] = bitcast ptr addrspace(3) [[TMP5]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[B:%.*]] = bitcast ptr addrspace(3) [[TMP6]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[A:%.*]] = bitcast ptr addrspace(3) [[TMP7]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[A]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr addrspace(3) [[B]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr addrspace(3) [[C]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr addrspace(3) [[D]], align 1 +; CHECK-NEXT: ret void +; entry: -; CHECK: %4 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %3 -; CHECK: %d = bitcast ptr addrspace(3) %4 to ptr addrspace(3) -; CHECK: %5 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %2 -; CHECK: %c = bitcast ptr addrspace(3) %5 to ptr addrspace(3) -; CHECK: %6 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %1 -; CHECK: %b = bitcast ptr addrspace(3) %6 to ptr 
addrspace(3) -; CHECK: %7 = getelementptr inbounds [0 x i8], ptr addrspace(3) @_ZTS14example_kernel_shared_mem, i32 0, i32 %0 -; CHECK: %a = bitcast ptr addrspace(3) %7 to ptr addrspace(3) %0 = load i32, ptr addrspace(3) %a -; CHECK: %8 = load i32, ptr addrspace(3) %a %1 = load i64, ptr addrspace(3) %b -; CHECK: %9 = load i64, ptr addrspace(3) %b %2 = load i16, ptr addrspace(3) %c -; CHECK: %10 = load i16, ptr addrspace(3) %c %3 = load i8, ptr addrspace(3) %d -; CHECK: %11 = load i8, ptr addrspace(3) %d ret void } +!llvm.module.flags = !{!6} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !nvvmir.version = !{!5} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} -; CHECK: !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} !2 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = !{i32 1, i32 4} +!6 = !{i32 1, !"sycl-device", i32 1} +;. +; CHECK: [[META0:![0-9]+]] = !{i32 1, !"sycl-device", i32 1} +; CHECK: [[META1:![0-9]+]] = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} +; CHECK: [[META2:![0-9]+]] = !{null, !"align", i32 8} +; CHECK: [[META3:![0-9]+]] = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +; CHECK: [[META4:![0-9]+]] = !{null, !"align", i32 16} +; CHECK: [[META5:![0-9]+]] = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +; CHECK: [[META6:![0-9]+]] = !{i32 1, i32 4} +;. 
diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll index b0c80068434c8..c61805afa6e4c 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-triple.ll @@ -5,18 +5,16 @@ ; CHECK-VALID: .param .u32 _ZTS14example_kernel_param_0 ; CHECK-INVALID: .param .u64 .ptr .shared .align 1 _ZTS14example_kernel_param_0 -; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll' -source_filename = "local-accessor-to-shared-memory-valid-triple.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a) { +define void @_ZTS14example_kernel(ptr addrspace(3) %a) { entry: %0 = load i32, ptr addrspace(3) %a ret void } +!llvm.module.flags = !{!6} !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !nvvmir.version = !{!5} @@ -26,3 +24,4 @@ entry: !3 = !{null, !"align", i32 16} !4 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} !5 = !{i32 1, i32 4} +!6 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll index 467cea7204f3c..2b36003e6dfd9 100644 --- a/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll +++ b/llvm/test/CodeGen/NVPTX/local-accessor-to-shared-memory-valid-triple.ll @@ -4,13 +4,10 @@ ; RUN: llc -march=nvptx64 -mcpu=sm_20 < %s | FileCheck --check-prefix=CHECK-OPT %s ; CHECK-OPT: .param .u32 _ZTS14example_kernel_param_0 -; ModuleID = 'local-accessor-to-shared-memory-valid-triple.ll' -source_filename = "local-accessor-to-shared-memory-valid-triple.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; 
Function Attrs: noinline -define weak_odr dso_local void @_ZTS14example_kernel(ptr addrspace(3) %a) { +define void @_ZTS14example_kernel(ptr addrspace(3) %a) { entry: %0 = load i32, ptr addrspace(3) %a ret void @@ -19,7 +16,7 @@ entry: !nvvm.annotations = !{!0, !1, !2, !1, !3, !3, !3, !3, !4, !4, !3} !llvm.ident = !{!7, !8} !nvvmir.version = !{!9} -!llvm.module.flags = !{!10, !11} +!llvm.module.flags = !{!10, !11, !12} !0 = distinct !{ptr @_ZTS14example_kernel, !"kernel", i32 1} !1 = !{null, !"align", i32 8} @@ -33,3 +30,4 @@ entry: !9 = !{i32 1, i32 4} !10 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 0]} !11 = !{i32 1, !"wchar_size", i32 4} +!12 = !{i32 1, !"sycl-device", i32 1} diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index ff1e2e0a352c5..db0aebe392b51 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -825,13 +825,8 @@ class AMDGCNTargetFusionInfo final : public NVPTXAMDGCNTargetFusionInfoBase { public: using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase; - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { - removeDeletedKernelsFromMD("amdgcn.annotations", Funcs); - } - void addKernelFunction(Function *KernelFunc) const override { KernelFunc->setCallingConv(CallingConv::AMDGPU_KERNEL); - addKernelToMD("amdgcn.annotations", KernelFunc); } void createBarrierCall(IRBuilderBase &Builder, diff --git a/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll b/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll index b08d7ba472e57..8bff3b1536888 100644 --- a/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll +++ b/sycl-fusion/test/internalization/promote-private-non-unit-hip.ll @@ -127,9 +127,6 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo attributes #1 = { nounwind speculatable memory(none) } attributes #3 = { 
"frame-pointer"="all" "target-cpu"="gfx1031" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" } -!amdgcn.annotations = !{!9} - -!9 = !{ptr @fused_0, !"kernel", i32 1} !12 = !{!"private", !"none", !"none", !"none", !"none", !"none", !"none", !"none", !"private", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} !13 = !{i64 3, !"", !"", !"", !"", !"", !"", !"", i64 3, !"", !"", !"", !"", !"", !"", !""} !14 = !{i64 32, !"", !"", !"", !"", !"", !"", !"", i64 1, !"", !"", !"", !"", !"", !"", !""} diff --git a/sycl-fusion/test/kernel-fusion/check-remapping-amdgpu.ll b/sycl-fusion/test/kernel-fusion/check-remapping-amdgpu.ll index 2c88036564ccd..91428cae55773 100644 --- a/sycl-fusion/test/kernel-fusion/check-remapping-amdgpu.ll +++ b/sycl-fusion/test/kernel-fusion/check-remapping-amdgpu.ll @@ -61,7 +61,8 @@ declare !sycl.kernel.fused !13 !sycl.kernel.nd-ranges !15 !sycl.kernel.nd-range ; CHECK: @[[__GLOBAL_OFFSET_REMAPPER_1_10_1_1_10_1_1_3_48_1_1_2_1_1_X__CONST:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(5) constant [3 x i32] zeroinitializer ; CHECK-LABEL: define amdgpu_kernel void @fused_0( -; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) !work_group_size_hint !3 !kernel_arg_buffer_location !4 !kernel_arg_runtime_aligned !4 !kernel_arg_exclusive_ptr !4 { +; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) +; CHECK-SAME: !work_group_size_hint ![[META0:[0-9]+]] !kernel_arg_buffer_location ![[META1:[0-9]+]] !kernel_arg_runtime_aligned ![[META1]] !kernel_arg_exclusive_ptr ![[META1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__global_linear_id_3_48_1_1_2_1_1() #[[ATTRS:.*]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 42 @@ -486,7 +487,8 @@ 
declare !sycl.kernel.fused !13 !sycl.kernel.nd-ranges !15 !sycl.kernel.nd-range declare !sycl.kernel.fused !31 !sycl.kernel.nd-ranges !25 !sycl.kernel.nd-range !24 void @fused_kernel_1D() ; CHECK-LABEL: define amdgpu_kernel void @fused_1( -; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) !work_group_size_hint !3 !kernel_arg_buffer_location !4 !kernel_arg_runtime_aligned !4 !kernel_arg_exclusive_ptr !4 { +; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) +; CHECK-SAME: !work_group_size_hint ![[META0]] !kernel_arg_buffer_location ![[META1]] !kernel_arg_runtime_aligned ![[META1]] !kernel_arg_exclusive_ptr ![[META1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__global_linear_id_3_48_1_1_2_1_1() #[[ATTRS]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 20 diff --git a/sycl-fusion/test/kernel-fusion/check-remapping-interproc-amdgpu.ll b/sycl-fusion/test/kernel-fusion/check-remapping-interproc-amdgpu.ll index 8b7a44a147099..436ce4ad98b64 100644 --- a/sycl-fusion/test/kernel-fusion/check-remapping-interproc-amdgpu.ll +++ b/sycl-fusion/test/kernel-fusion/check-remapping-interproc-amdgpu.ll @@ -126,7 +126,8 @@ entry: declare !sycl.kernel.fused !13 !sycl.kernel.nd-ranges !15 !sycl.kernel.nd-range !24 void @fused_kernel() ; CHECK-LABEL: define amdgpu_kernel void @fused_0( -; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) !work_group_size_hint !3 !kernel_arg_buffer_location !4 !kernel_arg_runtime_aligned !4 !kernel_arg_exclusive_ptr !4 { +; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) +; CHECK-SAME: !work_group_size_hint ![[META0:[0-9]+]] !kernel_arg_buffer_location ![[META1:[0-9]+]] !kernel_arg_runtime_aligned ![[META1]] !kernel_arg_exclusive_ptr ![[META1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__global_linear_id_3_48_1_1_2_1_1() #[[ATTRS:.*]] ; CHECK-NEXT: 
[[TMP1:%.*]] = icmp ult i32 [[TMP0]], 42 @@ -236,7 +237,8 @@ declare !sycl.kernel.fused !13 !sycl.kernel.nd-ranges !15 !sycl.kernel.nd-range declare !sycl.kernel.fused !31 !sycl.kernel.nd-ranges !25 !sycl.kernel.nd-range !24 void @fused_kernel_1D() ; CHECK-LABEL: define amdgpu_kernel void @fused_1( -; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) !work_group_size_hint !3 !kernel_arg_buffer_location !4 !kernel_arg_runtime_aligned !4 !kernel_arg_exclusive_ptr !4 { +; CHECK-SAME: i32 [[KERNELONE_X:%.*]], i32 [[KERNELONE_X1:%.*]], i32 [[KERNELONE_X2:%.*]]) +; CHECK-SAME: !work_group_size_hint ![[META0]] !kernel_arg_buffer_location ![[META1]] !kernel_arg_runtime_aligned ![[META1]] !kernel_arg_exclusive_ptr ![[META1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__global_linear_id_3_48_1_1_2_1_1() #[[ATTRS]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 20